/*
 * Mesa 3-D graphics library
 *
 * Copyright 2003 VMware, Inc.
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 * Copyright (C) 2016 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "main/glheader.h"
#include "main/context.h"
#include "main/varray.h"
#include "main/macros.h"
#include "main/sse_minmax.h"
#include "x86/common_x86_asm.h"
#include "util/hash_table.h"


struct minmax_cache_key {
   GLintptr offset;
   GLuint count;
   unsigned index_size;
};


struct minmax_cache_entry {
   struct minmax_cache_key key;
   GLuint min;
   GLuint max;
};


static uint32_t
vbo_minmax_cache_hash(const struct minmax_cache_key *key)
{
   return _mesa_hash_data(key, sizeof(*key));
}


static bool
vbo_minmax_cache_key_equal(const struct minmax_cache_key *a,
                           const struct minmax_cache_key *b)
{
   return (a->offset == b->offset) && (a->count == b->count) &&
          (a->index_size == b->index_size);
}


static void
vbo_minmax_cache_delete_entry(struct hash_entry *entry)
{
   free(entry->data);
}


static GLboolean
vbo_use_minmax_cache(struct gl_buffer_object *bufferObj)
{
   if (bufferObj->UsageHistory & (USAGE_TEXTURE_BUFFER |
                                  USAGE_ATOMIC_COUNTER_BUFFER |
                                  USAGE_SHADER_STORAGE_BUFFER |
                                  USAGE_TRANSFORM_FEEDBACK_BUFFER |
                                  USAGE_PIXEL_PACK_BUFFER |
                                  USAGE_DISABLE_MINMAX_CACHE))
      return GL_FALSE;

   if ((bufferObj->Mappings[MAP_USER].AccessFlags &
        (GL_MAP_PERSISTENT_BIT | GL_MAP_WRITE_BIT)) ==
       (GL_MAP_PERSISTENT_BIT | GL_MAP_WRITE_BIT))
      return GL_FALSE;

   return GL_TRUE;
}


void
vbo_delete_minmax_cache(struct gl_buffer_object *bufferObj)
{
   _mesa_hash_table_destroy(bufferObj->MinMaxCache,
                            vbo_minmax_cache_delete_entry);
   bufferObj->MinMaxCache = NULL;
}


static GLboolean
vbo_get_minmax_cached(struct gl_buffer_object *bufferObj,
                      unsigned index_size, GLintptr offset, GLuint count,
                      GLuint *min_index, GLuint *max_index)
{
   GLboolean found = GL_FALSE;
   struct minmax_cache_key key;
   uint32_t hash;
   struct hash_entry *result;

   if (!bufferObj->MinMaxCache)
      return GL_FALSE;
   if (!vbo_use_minmax_cache(bufferObj))
      return GL_FALSE;

   mtx_lock(&bufferObj->Mutex);

   if (bufferObj->MinMaxCacheDirty) {
      /* Disable the cache permanently for this BO if the number of hits
       * is asymptotically less than the number of misses. This happens when
       * applications use the BO for streaming.
       *
       * However, some initial optimism allows applications that interleave
       * draw calls with glBufferSubData during warmup.
       */
      unsigned optimism = bufferObj->Size;
      if (bufferObj->MinMaxCacheMissIndices > optimism &&
          bufferObj->MinMaxCacheHitIndices <
             bufferObj->MinMaxCacheMissIndices - optimism) {
         bufferObj->UsageHistory |= USAGE_DISABLE_MINMAX_CACHE;
         vbo_delete_minmax_cache(bufferObj);
         goto out_disable;
      }

      _mesa_hash_table_clear(bufferObj->MinMaxCache,
                             vbo_minmax_cache_delete_entry);
      bufferObj->MinMaxCacheDirty = false;
      goto out_invalidate;
   }

   key.index_size = index_size;
   key.offset = offset;
   key.count = count;
   hash = vbo_minmax_cache_hash(&key);
   result = _mesa_hash_table_search_pre_hashed(bufferObj->MinMaxCache,
                                               hash, &key);
   if (result) {
      struct minmax_cache_entry *entry = result->data;
      *min_index = entry->min;
      *max_index = entry->max;
      found = GL_TRUE;
   }

out_invalidate:
   if (found) {
      /* The hit counter saturates so that we don't accidentally disable the
       * cache in a long-running program.
       */
      unsigned new_hit_count = bufferObj->MinMaxCacheHitIndices + count;

      if (new_hit_count >= bufferObj->MinMaxCacheHitIndices)
         bufferObj->MinMaxCacheHitIndices = new_hit_count;
      else
         bufferObj->MinMaxCacheHitIndices = ~(unsigned)0;
   } else {
      bufferObj->MinMaxCacheMissIndices += count;
   }

out_disable:
   mtx_unlock(&bufferObj->Mutex);
   return found;
}


static void
vbo_minmax_cache_store(struct gl_context *ctx,
                       struct gl_buffer_object *bufferObj,
                       unsigned index_size, GLintptr offset, GLuint count,
                       GLuint min, GLuint max)
{
   struct minmax_cache_entry *entry;
   struct hash_entry *table_entry;
   uint32_t hash;

   if (!vbo_use_minmax_cache(bufferObj))
      return;

   mtx_lock(&bufferObj->Mutex);

   if (!bufferObj->MinMaxCache) {
      bufferObj->MinMaxCache =
         _mesa_hash_table_create(NULL,
                                 (uint32_t (*)(const void *))vbo_minmax_cache_hash,
                                 (bool (*)(const void *, const void *))vbo_minmax_cache_key_equal);
      if (!bufferObj->MinMaxCache)
         goto out;
   }

   entry = MALLOC_STRUCT(minmax_cache_entry);
   if (!entry)
      goto out;

   entry->key.offset = offset;
   entry->key.count = count;
   entry->key.index_size = index_size;
   entry->min = min;
   entry->max = max;
   hash = vbo_minmax_cache_hash(&entry->key);

   table_entry = _mesa_hash_table_search_pre_hashed(bufferObj->MinMaxCache,
                                                    hash, &entry->key);
   if (table_entry) {
      /* It seems like this could happen when two contexts are rendering using
       * the same buffer object from multiple threads.
       */
      _mesa_debug(ctx, "duplicate entry in minmax cache\n");
      free(entry);
      goto out;
   }

   table_entry = _mesa_hash_table_insert_pre_hashed(bufferObj->MinMaxCache,
                                                    hash, &entry->key, entry);
   if (!table_entry)
      free(entry);

out:
   mtx_unlock(&bufferObj->Mutex);
}


/**
 * Compute min and max elements by scanning the index buffer for
 * glDraw[Range]Elements() calls.
 * If primitive restart is enabled, we need to ignore restart
 * indexes when computing min/max.
 */
static void
vbo_get_minmax_index(struct gl_context *ctx,
                     const struct _mesa_prim *prim,
                     const struct _mesa_index_buffer *ib,
                     GLuint *min_index, GLuint *max_index,
                     const GLuint count)
{
   const GLboolean restart = ctx->Array._PrimitiveRestart;
   const GLuint restartIndex =
      _mesa_primitive_restart_index(ctx, ib->index_size);
   const char *indices;
   GLuint i;

   indices = (char *) ib->ptr + prim->start * ib->index_size;
   if (_mesa_is_bufferobj(ib->obj)) {
      GLsizeiptr size = MIN2(count * ib->index_size, ib->obj->Size);

      if (vbo_get_minmax_cached(ib->obj, ib->index_size, (GLintptr) indices,
                                count, min_index, max_index))
         return;

      indices = ctx->Driver.MapBufferRange(ctx, (GLintptr) indices, size,
                                           GL_MAP_READ_BIT, ib->obj,
                                           MAP_INTERNAL);
   }

   switch (ib->index_size) {
   case 4: {
      const GLuint *ui_indices = (const GLuint *)indices;
      GLuint max_ui = 0;
      GLuint min_ui = ~0U;
      if (restart) {
         for (i = 0; i < count; i++) {
            if (ui_indices[i] != restartIndex) {
               if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
               if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
            }
         }
      }
      else {
#if defined(USE_SSE41)
         if (cpu_has_sse4_1) {
            _mesa_uint_array_min_max(ui_indices, &min_ui, &max_ui, count);
         }
         else
#endif
            for (i = 0; i < count; i++) {
               if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
               if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
            }
      }
      *min_index = min_ui;
      *max_index = max_ui;
      break;
   }
   case 2: {
      const GLushort *us_indices = (const GLushort *)indices;
      GLuint max_us = 0;
      GLuint min_us = ~0U;
      if (restart) {
         for (i = 0; i < count; i++) {
            if (us_indices[i] != restartIndex) {
               if (us_indices[i] > max_us) max_us = us_indices[i];
               if (us_indices[i] < min_us) min_us = us_indices[i];
            }
         }
      }
      else {
         for (i = 0; i < count; i++) {
            if (us_indices[i] > max_us) max_us = us_indices[i];
            if (us_indices[i] < min_us) min_us = us_indices[i];
         }
      }
      *min_index = min_us;
      *max_index = max_us;
      break;
   }
   case 1: {
      const GLubyte *ub_indices = (const GLubyte *)indices;
      GLuint max_ub = 0;
      GLuint min_ub = ~0U;
      if (restart) {
         for (i = 0; i < count; i++) {
            if (ub_indices[i] != restartIndex) {
               if (ub_indices[i] > max_ub) max_ub = ub_indices[i];
               if (ub_indices[i] < min_ub) min_ub = ub_indices[i];
            }
         }
      }
      else {
         for (i = 0; i < count; i++) {
            if (ub_indices[i] > max_ub) max_ub = ub_indices[i];
            if (ub_indices[i] < min_ub) min_ub = ub_indices[i];
         }
      }
      *min_index = min_ub;
      *max_index = max_ub;
      break;
   }
   default:
      unreachable("not reached");
   }

   if (_mesa_is_bufferobj(ib->obj)) {
      vbo_minmax_cache_store(ctx, ib->obj, ib->index_size, prim->start,
                             count, *min_index, *max_index);
      ctx->Driver.UnmapBuffer(ctx, ib->obj, MAP_INTERNAL);
   }
}


/**
 * Compute min and max elements for nr_prims
 */
void
vbo_get_minmax_indices(struct gl_context *ctx,
                       const struct _mesa_prim *prims,
                       const struct _mesa_index_buffer *ib,
                       GLuint *min_index,
                       GLuint *max_index,
                       GLuint nr_prims)
{
   GLuint tmp_min, tmp_max;
   GLuint i;
   GLuint count;

   *min_index = ~0;
   *max_index = 0;

   for (i = 0; i < nr_prims; i++) {
      const struct _mesa_prim *start_prim;

      start_prim = &prims[i];
      count = start_prim->count;

      /* Do combination if possible to reduce map/unmap count */
      while ((i + 1 < nr_prims) &&
             (prims[i].start + prims[i].count == prims[i+1].start)) {
         count += prims[i+1].count;
         i++;
      }

      vbo_get_minmax_index(ctx, start_prim, ib, &tmp_min, &tmp_max, count);
      *min_index = MIN2(*min_index, tmp_min);
      *max_index = MAX2(*max_index, tmp_max);
   }
}

/**************************************************************************
 *
 * Copyright 2011 Marek Olšák <maraeo@gmail.com>
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "util/u_vbuf.h"

#include "util/u_dump.h"
#include "util/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_upload_mgr.h"
#include "translate/translate.h"
#include "translate/translate_cache.h"
#include "cso_cache/cso_cache.h"
#include "cso_cache/cso_hash.h"

struct u_vbuf_elements {
   unsigned count;
   struct pipe_vertex_element ve[PIPE_MAX_ATTRIBS];

   unsigned src_format_size[PIPE_MAX_ATTRIBS];

   /* If (velem[i].src_format != native_format[i]), the vertex buffer
    * referenced by the vertex element cannot be used for rendering and
    * its vertex data must be translated to native_format[i]. */
   enum pipe_format native_format[PIPE_MAX_ATTRIBS];
   unsigned native_format_size[PIPE_MAX_ATTRIBS];

   /* This might mean two things:
    * - src_format != native_format, as discussed above.
    * - src_offset % 4 != 0 (if the caps don't allow such an offset). */
   uint32_t incompatible_elem_mask; /* each bit describes a corresp. attrib  */
   /* Which buffers have at least one incompatible vertex element
    * referencing them. */
   uint32_t incompatible_vb_mask_any;
   /* Which buffers are referenced only by incompatible vertex elements. */
   uint32_t incompatible_vb_mask_all;
   /* Which buffers have at least one compatible vertex element
    * referencing them. */
   uint32_t compatible_vb_mask_any;
   /* Which buffers are referenced only by compatible vertex elements. */
   uint32_t compatible_vb_mask_all;

   /* Which buffers have at least one non-instanced vertex element
    * referencing them. */
   uint32_t noninstance_vb_mask_any;

   void *driver_cso;
};

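/* Vertex buffer categories for the translate fallback: attribs are read
 * per-vertex, per-instance, or as a constant (zero-stride) element. At most
 * one fallback vertex buffer slot is allocated per category. */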
enum {
   VB_VERTEX = 0,
   VB_INSTANCE = 1,
   VB_CONST = 2,
   VB_NUM = 3
};

struct u_vbuf {
   struct u_vbuf_caps caps;

   struct pipe_context *pipe;
   struct translate_cache *translate_cache;
   struct cso_cache *cso_cache;
   struct u_upload_mgr *uploader;

   /* This is what was set in set_vertex_buffers.
    * May contain user buffers. */
   struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_vertex_buffers;

   /* Saved vertex buffers. */
   struct pipe_vertex_buffer vertex_buffer_saved[PIPE_MAX_ATTRIBS];
   unsigned nr_vertex_buffers_saved;

   /* Vertex buffers for the driver.
    * There are no user buffers. */
   struct pipe_vertex_buffer real_vertex_buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_real_vertex_buffers;
   boolean vertex_buffers_dirty;

   /* The index buffer. */
   struct pipe_index_buffer index_buffer;

   /* Vertex elements. */
   struct u_vbuf_elements *ve, *ve_saved;

   /* Vertex elements used for the translate fallback. */
   struct pipe_vertex_element fallback_velems[PIPE_MAX_ATTRIBS];
   /* If TRUE, the fallback vertex element state is currently bound (and
    * therefore used for rendering too). */
   boolean using_translate;
   /* The vertex buffer slot indices where translated vertices are stored,
    * one per buffer category (VB_VERTEX/VB_INSTANCE/VB_CONST). */
   unsigned fallback_vbs[VB_NUM];

   /* Which buffer is a user buffer. */
   uint32_t user_vb_mask; /* each bit describes a corresp. buffer */
   /* Which buffer is incompatible (unaligned). */
   uint32_t incompatible_vb_mask; /* each bit describes a corresp. buffer */
   /* Which buffer has a non-zero stride. */
   uint32_t nonzero_stride_vb_mask; /* each bit describes a corresp. buffer */
};

static void *
u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
                              const struct pipe_vertex_element *attribs);
static void u_vbuf_delete_vertex_elements(struct u_vbuf *mgr, void *cso);


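/* Query which vertex formats and which vertex buffer/element alignments the
 * driver supports natively; everything missing from the caps is emulated by
 * u_vbuf through the translate fallback or user-buffer uploads. */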
void u_vbuf_get_caps(struct pipe_screen *screen, struct u_vbuf_caps *caps)
{
   caps->format_fixed32 =
      screen->is_format_supported(screen, PIPE_FORMAT_R32_FIXED, PIPE_BUFFER,
                                  0, PIPE_BIND_VERTEX_BUFFER);

   caps->format_float16 =
      screen->is_format_supported(screen, PIPE_FORMAT_R16_FLOAT, PIPE_BUFFER,
                                  0, PIPE_BIND_VERTEX_BUFFER);

   caps->format_float64 =
      screen->is_format_supported(screen, PIPE_FORMAT_R64_FLOAT, PIPE_BUFFER,
                                  0, PIPE_BIND_VERTEX_BUFFER);

   caps->format_norm32 =
      screen->is_format_supported(screen, PIPE_FORMAT_R32_UNORM, PIPE_BUFFER,
                                  0, PIPE_BIND_VERTEX_BUFFER) &&
      screen->is_format_supported(screen, PIPE_FORMAT_R32_SNORM, PIPE_BUFFER,
                                  0, PIPE_BIND_VERTEX_BUFFER);

   caps->format_scaled32 =
      screen->is_format_supported(screen, PIPE_FORMAT_R32_USCALED, PIPE_BUFFER,
                                  0, PIPE_BIND_VERTEX_BUFFER) &&
      screen->is_format_supported(screen, PIPE_FORMAT_R32_SSCALED, PIPE_BUFFER,
                                  0, PIPE_BIND_VERTEX_BUFFER);

   caps->buffer_offset_unaligned =
      !screen->get_param(screen,
                        PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY);

   caps->buffer_stride_unaligned =
      !screen->get_param(screen,
                        PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY);

   caps->velem_src_offset_unaligned =
      !screen->get_param(screen,
                        PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY);

   caps->user_vertex_buffers =
      screen->get_param(screen, PIPE_CAP_USER_VERTEX_BUFFERS);
}

struct u_vbuf *
u_vbuf_create(struct pipe_context *pipe,
              struct u_vbuf_caps *caps)
{
   struct u_vbuf *mgr = CALLOC_STRUCT(u_vbuf);

   mgr->caps = *caps;
   mgr->pipe = pipe;
   mgr->cso_cache = cso_cache_create();
   mgr->translate_cache = translate_cache_create();
   memset(mgr->fallback_vbs, ~0, sizeof(mgr->fallback_vbs));

   mgr->uploader = u_upload_create(pipe, 1024 * 1024, 4,
                                   PIPE_BIND_VERTEX_BUFFER);

   return mgr;
}
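
/* A minimal sketch of how a driver might tie these entry points together.
 * The call sites here are hypothetical; real drivers wire this up in their
 * own state-tracking code:
 *
 *    struct u_vbuf_caps caps;
 *    u_vbuf_get_caps(screen, &caps);
 *    mgr = u_vbuf_create(pipe, &caps);
 *    ...
 *    u_vbuf_set_vertex_elements(mgr, count, velems);
 *    u_vbuf_set_vertex_buffers(mgr, count, buffers);
 *    u_vbuf_set_index_buffer(mgr, &ib);
 *    u_vbuf_draw_vbo(mgr, &info);
 */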

/* u_vbuf uses its own caching for vertex elements, because it needs to keep
 * its own preprocessed state per vertex element CSO. */
static struct u_vbuf_elements *
u_vbuf_set_vertex_elements_internal(struct u_vbuf *mgr, unsigned count,
                                    const struct pipe_vertex_element *states)
{
   struct pipe_context *pipe = mgr->pipe;
   unsigned key_size, hash_key;
   struct cso_hash_iter iter;
   struct u_vbuf_elements *ve;
   struct cso_velems_state velems_state;

   /* need to include the count in the stored state data too. */
   key_size = sizeof(struct pipe_vertex_element) * count + sizeof(unsigned);
   velems_state.count = count;
   memcpy(velems_state.velems, states,
          sizeof(struct pipe_vertex_element) * count);
   hash_key = cso_construct_key((void*)&velems_state, key_size);
   iter = cso_find_state_template(mgr->cso_cache, hash_key, CSO_VELEMENTS,
                                  (void*)&velems_state, key_size);

   if (cso_hash_iter_is_null(iter)) {
      struct cso_velements *cso = MALLOC_STRUCT(cso_velements);
      memcpy(&cso->state, &velems_state, key_size);
      cso->data = u_vbuf_create_vertex_elements(mgr, count, states);
      cso->delete_state = (cso_state_callback)u_vbuf_delete_vertex_elements;
      cso->context = (void*)mgr;

      iter = cso_insert_state(mgr->cso_cache, hash_key, CSO_VELEMENTS, cso);
      ve = cso->data;
   } else {
      ve = ((struct cso_velements *)cso_hash_iter_data(iter))->data;
   }

   assert(ve);
   pipe->bind_vertex_elements_state(pipe, ve->driver_cso);
   return ve;
}

void u_vbuf_set_vertex_elements(struct u_vbuf *mgr, unsigned count,
                               const struct pipe_vertex_element *states)
{
   mgr->ve = u_vbuf_set_vertex_elements_internal(mgr, count, states);
}

void u_vbuf_destroy(struct u_vbuf *mgr)
{
   unsigned i;

   mgr->pipe->set_vertex_buffers(mgr->pipe, 0, NULL);

   for (i = 0; i < mgr->nr_vertex_buffers; i++) {
      pipe_resource_reference(&mgr->vertex_buffer[i].buffer, NULL);
   }
   for (i = 0; i < mgr->nr_real_vertex_buffers; i++) {
      pipe_resource_reference(&mgr->real_vertex_buffer[i].buffer, NULL);
   }

   translate_cache_destroy(mgr->translate_cache);
   u_upload_destroy(mgr->uploader);
   cso_cache_delete(mgr->cso_cache);
   FREE(mgr);
}

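/* Translate the vertex buffers selected by vb_mask into the interleaved
 * layout described by *key and store the result in real_vertex_buffer[out_vb].
 * With unroll_indices, the index buffer is read and one output vertex is
 * emitted per index rather than per input vertex. */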
static void
u_vbuf_translate_buffers(struct u_vbuf *mgr, struct translate_key *key,
                         unsigned vb_mask, unsigned out_vb,
                         int start_vertex, unsigned num_vertices,
                         int start_index, unsigned num_indices, int min_index,
                         boolean unroll_indices)
{
   struct translate *tr;
   struct pipe_transfer *vb_transfer[PIPE_MAX_ATTRIBS] = {0};
   struct pipe_resource *out_buffer = NULL;
   uint8_t *out_map;
   unsigned i, out_offset;

   /* Get a translate object. */
   tr = translate_cache_find(mgr->translate_cache, key);

   /* Map buffers we want to translate. */
   for (i = 0; i < mgr->nr_vertex_buffers; i++) {
      if (vb_mask & (1 << i)) {
         struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[i];
         unsigned offset = vb->buffer_offset + vb->stride * start_vertex;
         uint8_t *map;

         if (vb->user_buffer) {
            map = (uint8_t*)vb->user_buffer + offset;
         } else {
            unsigned size = vb->stride ? num_vertices * vb->stride
                                       : sizeof(double)*4;

            if (offset+size > vb->buffer->width0) {
               size = vb->buffer->width0 - offset;
            }

            map = pipe_buffer_map_range(mgr->pipe, vb->buffer, offset, size,
                                        PIPE_TRANSFER_READ, &vb_transfer[i]);
         }

         /* Subtract min_index so that indexing with the index buffer works. */
         if (unroll_indices) {
            map -= vb->stride * min_index;
         }

         tr->set_buffer(tr, i, map, vb->stride, ~0);
      }
   }

   /* Translate. */
   if (unroll_indices) {
      struct pipe_index_buffer *ib = &mgr->index_buffer;
      struct pipe_transfer *transfer = NULL;
      unsigned offset = ib->offset + start_index * ib->index_size;
      uint8_t *map;

      assert((ib->buffer || ib->user_buffer) && ib->index_size);

      if (ib->user_buffer) {
         map = (uint8_t*)ib->user_buffer + offset;
      } else {
         map = pipe_buffer_map_range(mgr->pipe, ib->buffer, offset,
                                     num_indices * ib->index_size,
                                     PIPE_TRANSFER_READ, &transfer);
      }

      /* Create and map the output buffer. */
      u_upload_alloc(mgr->uploader, 0,
                     key->output_stride * num_indices,
                     &out_offset, &out_buffer,
                     (void**)&out_map);

      switch (ib->index_size) {
      case 4:
         tr->run_elts(tr, (unsigned*)map, num_indices, 0, out_map);
         break;
      case 2:
         tr->run_elts16(tr, (uint16_t*)map, num_indices, 0, out_map);
         break;
      case 1:
         tr->run_elts8(tr, map, num_indices, 0, out_map);
         break;
      }

      if (transfer) {
         pipe_buffer_unmap(mgr->pipe, transfer);
      }
   } else {
      /* Create and map the output buffer. */
      u_upload_alloc(mgr->uploader,
                     key->output_stride * start_vertex,
                     key->output_stride * num_vertices,
                     &out_offset, &out_buffer,
                     (void**)&out_map);

      out_offset -= key->output_stride * start_vertex;

      tr->run(tr, 0, num_vertices, 0, out_map);
   }

   /* Unmap all buffers. */
   for (i = 0; i < mgr->nr_vertex_buffers; i++) {
      if (vb_transfer[i]) {
         pipe_buffer_unmap(mgr->pipe, vb_transfer[i]);
      }
   }

   /* Setup the new vertex buffer. */
   mgr->real_vertex_buffer[out_vb].buffer_offset = out_offset;
   mgr->real_vertex_buffer[out_vb].stride = key->output_stride;

   /* Move the buffer reference. */
   pipe_resource_reference(
      &mgr->real_vertex_buffer[out_vb].buffer, NULL);
   mgr->real_vertex_buffer[out_vb].buffer = out_buffer;
}

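/* Pick a free vertex buffer slot for each buffer category in mask[]. A slot
 * is free if no buffer is bound there or if every vertex element referencing
 * it is incompatible (that data is replaced by translated vertices anyway).
 * Returns FALSE if there aren't enough free slots. */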
static boolean
u_vbuf_translate_find_free_vb_slots(struct u_vbuf *mgr,
                                    unsigned mask[VB_NUM])
{
   unsigned type;
   unsigned fallback_vbs[VB_NUM];
   /* Set the bit for each buffer slot which is incompatible or unbound. */
   uint32_t unused_vb_mask =
      mgr->ve->incompatible_vb_mask_all | mgr->incompatible_vb_mask |
      ~((1 << mgr->nr_vertex_buffers) - 1);

   memset(fallback_vbs, ~0, sizeof(fallback_vbs));

   /* Find free slots for each type if needed. */
   for (type = 0; type < VB_NUM; type++) {
      if (mask[type]) {
         uint32_t index;

         if (!unused_vb_mask) {
            /* fail, reset the number to its original value */
            mgr->nr_real_vertex_buffers = mgr->nr_vertex_buffers;
            return FALSE;
         }

         index = ffs(unused_vb_mask) - 1;
         fallback_vbs[type] = index;
         /* Don't hand the same slot out to two categories. */
         unused_vb_mask &= ~(1 << index);
         if (index >= mgr->nr_real_vertex_buffers) {
            mgr->nr_real_vertex_buffers = index + 1;
         }
         /*printf("found slot=%i for type=%i\n", index, type);*/
      }
   }

   memcpy(mgr->fallback_vbs, fallback_vbs, sizeof(fallback_vbs));
   return TRUE;
}

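/* Set up the translate fallback: classify each vertex element as per-vertex,
 * per-instance, or constant, build one translate key per category, translate
 * the affected buffers into free slots, and bind fallback vertex elements
 * which read the translated data instead of the original buffers. */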
static boolean
u_vbuf_translate_begin(struct u_vbuf *mgr,
                       int start_vertex, unsigned num_vertices,
                       int start_instance, unsigned num_instances,
                       int start_index, unsigned num_indices, int min_index,
                       boolean unroll_indices)
{
   unsigned mask[VB_NUM] = {0};
   struct translate_key key[VB_NUM];
   unsigned elem_index[VB_NUM][PIPE_MAX_ATTRIBS]; /* ... into key.elements */
   unsigned i, type;

   int start[VB_NUM] = {
      start_vertex,     /* VERTEX */
      start_instance,   /* INSTANCE */
      0                 /* CONST */
   };

   unsigned num[VB_NUM] = {
      num_vertices,     /* VERTEX */
      num_instances,    /* INSTANCE */
      1                 /* CONST */
   };

   memset(key, 0, sizeof(key));
   memset(elem_index, ~0, sizeof(elem_index));

   /* See if there are vertex attribs of each type to translate and
    * which ones. */
   for (i = 0; i < mgr->ve->count; i++) {
      unsigned vb_index = mgr->ve->ve[i].vertex_buffer_index;

      if (!mgr->vertex_buffer[vb_index].stride) {
         if (!(mgr->ve->incompatible_elem_mask & (1 << i)) &&
             !(mgr->incompatible_vb_mask & (1 << vb_index))) {
            continue;
         }
         mask[VB_CONST] |= 1 << vb_index;
      } else if (mgr->ve->ve[i].instance_divisor) {
         if (!(mgr->ve->incompatible_elem_mask & (1 << i)) &&
             !(mgr->incompatible_vb_mask & (1 << vb_index))) {
            continue;
         }
         mask[VB_INSTANCE] |= 1 << vb_index;
      } else {
         if (!unroll_indices &&
             !(mgr->ve->incompatible_elem_mask & (1 << i)) &&
             !(mgr->incompatible_vb_mask & (1 << vb_index))) {
            continue;
         }
         mask[VB_VERTEX] |= 1 << vb_index;
      }
   }

   assert(mask[VB_VERTEX] || mask[VB_INSTANCE] || mask[VB_CONST]);

   /* Find free vertex buffer slots. */
   if (!u_vbuf_translate_find_free_vb_slots(mgr, mask)) {
      return FALSE;
   }

   /* Initialize the translate keys. */
   for (i = 0; i < mgr->ve->count; i++) {
      struct translate_key *k;
      struct translate_element *te;
      unsigned bit, vb_index = mgr->ve->ve[i].vertex_buffer_index;
      bit = 1 << vb_index;

      if (!(mgr->ve->incompatible_elem_mask & (1 << i)) &&
          !(mgr->incompatible_vb_mask & (1 << vb_index)) &&
          (!unroll_indices || !(mask[VB_VERTEX] & bit))) {
         continue;
      }

      /* Set type to what we will translate:
       * vertex, instance, or constant attribs. */
      for (type = 0; type < VB_NUM; type++) {
         if (mask[type] & bit) {
            break;
         }
      }
      assert(type < VB_NUM);
      assert(translate_is_output_format_supported(mgr->ve->native_format[i]));
      /*printf("velem=%i type=%i\n", i, type);*/

      /* Add the vertex element. */
      k = &key[type];
      elem_index[type][i] = k->nr_elements;

      te = &k->element[k->nr_elements];
      te->type = TRANSLATE_ELEMENT_NORMAL;
      te->instance_divisor = 0;
      te->input_buffer = vb_index;
      te->input_format = mgr->ve->ve[i].src_format;
      te->input_offset = mgr->ve->ve[i].src_offset;
      te->output_format = mgr->ve->native_format[i];
      te->output_offset = k->output_stride;

      k->output_stride += mgr->ve->native_format_size[i];
      k->nr_elements++;
   }

   /* Translate buffers. */
   for (type = 0; type < VB_NUM; type++) {
      if (key[type].nr_elements) {
         u_vbuf_translate_buffers(mgr, &key[type], mask[type],
                                  mgr->fallback_vbs[type],
                                  start[type], num[type],
                                  start_index, num_indices, min_index,
                                  unroll_indices && type == VB_VERTEX);

         /* Fixup the stride for constant attribs. */
         if (type == VB_CONST) {
            mgr->real_vertex_buffer[mgr->fallback_vbs[VB_CONST]].stride = 0;
         }
      }
   }

   /* Setup new vertex elements. */
   for (i = 0; i < mgr->ve->count; i++) {
      for (type = 0; type < VB_NUM; type++) {
         if (elem_index[type][i] < key[type].nr_elements) {
            struct translate_element *te = &key[type].element[elem_index[type][i]];
            mgr->fallback_velems[i].instance_divisor = mgr->ve->ve[i].instance_divisor;
            mgr->fallback_velems[i].src_format = te->output_format;
            mgr->fallback_velems[i].src_offset = te->output_offset;
            mgr->fallback_velems[i].vertex_buffer_index = mgr->fallback_vbs[type];

            /* elem_index[type][i] can only be set for one type. */
            assert(type > VB_INSTANCE || elem_index[type+1][i] == ~0);
            assert(type > VB_VERTEX   || elem_index[type+2][i] == ~0);
            break;
         }
      }
      /* No translating, just copy the original vertex element over. */
      if (type == VB_NUM) {
         memcpy(&mgr->fallback_velems[i], &mgr->ve->ve[i],
                sizeof(struct pipe_vertex_element));
      }
   }

   u_vbuf_set_vertex_elements_internal(mgr, mgr->ve->count,
                                       mgr->fallback_velems);
   mgr->using_translate = TRUE;
   return TRUE;
}

static void u_vbuf_translate_end(struct u_vbuf *mgr)
{
   unsigned i;

   /* Restore vertex elements. */
   mgr->pipe->bind_vertex_elements_state(mgr->pipe, mgr->ve->driver_cso);
   mgr->using_translate = FALSE;

   /* Unreference the now-unused VBOs. */
   for (i = 0; i < VB_NUM; i++) {
      unsigned vb = mgr->fallback_vbs[i];
      if (vb != ~0) {
         pipe_resource_reference(&mgr->real_vertex_buffer[vb].buffer, NULL);
         mgr->fallback_vbs[i] = ~0;
      }
   }
   mgr->nr_real_vertex_buffers = mgr->nr_vertex_buffers;
}

#define FORMAT_REPLACE(what, withwhat) \
    case PIPE_FORMAT_##what: format = PIPE_FORMAT_##withwhat; break

static void *
u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
                              const struct pipe_vertex_element *attribs)
{
   struct pipe_context *pipe = mgr->pipe;
   unsigned i;
   struct pipe_vertex_element driver_attribs[PIPE_MAX_ATTRIBS];
   struct u_vbuf_elements *ve = CALLOC_STRUCT(u_vbuf_elements);
   uint32_t used_buffers = 0;

   ve->count = count;

   memcpy(ve->ve, attribs, sizeof(struct pipe_vertex_element) * count);
   memcpy(driver_attribs, attribs, sizeof(struct pipe_vertex_element) * count);

   /* Set the best native format in case the original format is not
    * supported. */
   for (i = 0; i < count; i++) {
      enum pipe_format format = ve->ve[i].src_format;

      ve->src_format_size[i] = util_format_get_blocksize(format);

      used_buffers |= 1 << ve->ve[i].vertex_buffer_index;

      if (!ve->ve[i].instance_divisor) {
         ve->noninstance_vb_mask_any |= 1 << ve->ve[i].vertex_buffer_index;
      }

      /* Choose a native format.
       * For now we don't care about the alignment; that's going to
       * be sorted out later. */
      if (!mgr->caps.format_fixed32) {
         switch (format) {
            FORMAT_REPLACE(R32_FIXED,           R32_FLOAT);
            FORMAT_REPLACE(R32G32_FIXED,        R32G32_FLOAT);
            FORMAT_REPLACE(R32G32B32_FIXED,     R32G32B32_FLOAT);
            FORMAT_REPLACE(R32G32B32A32_FIXED,  R32G32B32A32_FLOAT);
            default:;
         }
      }
      if (!mgr->caps.format_float16) {
         switch (format) {
            FORMAT_REPLACE(R16_FLOAT,           R32_FLOAT);
            FORMAT_REPLACE(R16G16_FLOAT,        R32G32_FLOAT);
            FORMAT_REPLACE(R16G16B16_FLOAT,     R32G32B32_FLOAT);
            FORMAT_REPLACE(R16G16B16A16_FLOAT,  R32G32B32A32_FLOAT);
            default:;
         }
      }
      if (!mgr->caps.format_float64) {
         switch (format) {
            FORMAT_REPLACE(R64_FLOAT,           R32_FLOAT);
            FORMAT_REPLACE(R64G64_FLOAT,        R32G32_FLOAT);
            FORMAT_REPLACE(R64G64B64_FLOAT,     R32G32B32_FLOAT);
            FORMAT_REPLACE(R64G64B64A64_FLOAT,  R32G32B32A32_FLOAT);
            default:;
         }
      }
      if (!mgr->caps.format_norm32) {
         switch (format) {
            FORMAT_REPLACE(R32_UNORM,           R32_FLOAT);
            FORMAT_REPLACE(R32G32_UNORM,        R32G32_FLOAT);
            FORMAT_REPLACE(R32G32B32_UNORM,     R32G32B32_FLOAT);
            FORMAT_REPLACE(R32G32B32A32_UNORM,  R32G32B32A32_FLOAT);
            FORMAT_REPLACE(R32_SNORM,           R32_FLOAT);
            FORMAT_REPLACE(R32G32_SNORM,        R32G32_FLOAT);
            FORMAT_REPLACE(R32G32B32_SNORM,     R32G32B32_FLOAT);
            FORMAT_REPLACE(R32G32B32A32_SNORM,  R32G32B32A32_FLOAT);
            default:;
         }
      }
      if (!mgr->caps.format_scaled32) {
         switch (format) {
            FORMAT_REPLACE(R32_USCALED,         R32_FLOAT);
            FORMAT_REPLACE(R32G32_USCALED,      R32G32_FLOAT);
            FORMAT_REPLACE(R32G32B32_USCALED,   R32G32B32_FLOAT);
            FORMAT_REPLACE(R32G32B32A32_USCALED,R32G32B32A32_FLOAT);
            FORMAT_REPLACE(R32_SSCALED,         R32_FLOAT);
            FORMAT_REPLACE(R32G32_SSCALED,      R32G32_FLOAT);
            FORMAT_REPLACE(R32G32B32_SSCALED,   R32G32B32_FLOAT);
            FORMAT_REPLACE(R32G32B32A32_SSCALED,R32G32B32A32_FLOAT);
            default:;
         }
      }

      driver_attribs[i].src_format = format;
      ve->native_format[i] = format;
      ve->native_format_size[i] =
            util_format_get_blocksize(ve->native_format[i]);

      if (ve->ve[i].src_format != format ||
          (!mgr->caps.velem_src_offset_unaligned &&
           ve->ve[i].src_offset % 4 != 0)) {
         ve->incompatible_elem_mask |= 1 << i;
         ve->incompatible_vb_mask_any |= 1 << ve->ve[i].vertex_buffer_index;
      } else {
         ve->compatible_vb_mask_any |= 1 << ve->ve[i].vertex_buffer_index;
      }
   }

   ve->compatible_vb_mask_all = ~ve->incompatible_vb_mask_any & used_buffers;
   ve->incompatible_vb_mask_all = ~ve->compatible_vb_mask_any & used_buffers;

   /* Align the formats to the size of DWORD if needed. */
   if (!mgr->caps.velem_src_offset_unaligned) {
      for (i = 0; i < count; i++) {
         ve->native_format_size[i] = align(ve->native_format_size[i], 4);
      }
   }

   ve->driver_cso =
      pipe->create_vertex_elements_state(pipe, count, driver_attribs);
   return ve;
}

static void u_vbuf_delete_vertex_elements(struct u_vbuf *mgr, void *cso)
{
   struct pipe_context *pipe = mgr->pipe;
   struct u_vbuf_elements *ve = cso;

   pipe->delete_vertex_elements_state(pipe, ve->driver_cso);
   FREE(ve);
}

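/* Bind vertex buffers and classify them. User buffers and buffers with an
 * unsupported offset/stride alignment are tracked in bitmasks and get no
 * resource reference in real_vertex_buffer; they are uploaded or translated
 * at draw time instead. */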
void u_vbuf_set_vertex_buffers(struct u_vbuf *mgr, unsigned count,
                               const struct pipe_vertex_buffer *bufs)
{
   unsigned i;

   mgr->user_vb_mask = 0;
   mgr->incompatible_vb_mask = 0;
   mgr->nonzero_stride_vb_mask = 0;

   for (i = 0; i < count; i++) {
      const struct pipe_vertex_buffer *vb = &bufs[i];
      struct pipe_vertex_buffer *orig_vb = &mgr->vertex_buffer[i];
      struct pipe_vertex_buffer *real_vb = &mgr->real_vertex_buffer[i];

      pipe_resource_reference(&orig_vb->buffer, vb->buffer);
      orig_vb->user_buffer = vb->user_buffer;

      real_vb->buffer_offset = orig_vb->buffer_offset = vb->buffer_offset;
      real_vb->stride = orig_vb->stride = vb->stride;
      real_vb->user_buffer = NULL;

      if (vb->stride) {
         mgr->nonzero_stride_vb_mask |= 1 << i;
      }

      if (!vb->buffer && !vb->user_buffer) {
         pipe_resource_reference(&real_vb->buffer, NULL);
         continue;
      }

      if ((!mgr->caps.buffer_offset_unaligned && vb->buffer_offset % 4 != 0) ||
          (!mgr->caps.buffer_stride_unaligned && vb->stride % 4 != 0)) {
         mgr->incompatible_vb_mask |= 1 << i;
         pipe_resource_reference(&real_vb->buffer, NULL);
         continue;
      }

      if (!mgr->caps.user_vertex_buffers && vb->user_buffer) {
         mgr->user_vb_mask |= 1 << i;
         pipe_resource_reference(&real_vb->buffer, NULL);
         continue;
      }

      pipe_resource_reference(&real_vb->buffer, vb->buffer);
      real_vb->user_buffer = vb->user_buffer;
   }

   for (i = count; i < mgr->nr_vertex_buffers; i++) {
      pipe_resource_reference(&mgr->vertex_buffer[i].buffer, NULL);
   }
   for (i = count; i < mgr->nr_real_vertex_buffers; i++) {
      pipe_resource_reference(&mgr->real_vertex_buffer[i].buffer, NULL);
   }

   mgr->nr_vertex_buffers = count;
   mgr->nr_real_vertex_buffers = count;
   mgr->vertex_buffers_dirty = TRUE;
}

void u_vbuf_set_index_buffer(struct u_vbuf *mgr,
                             const struct pipe_index_buffer *ib)
{
   struct pipe_context *pipe = mgr->pipe;

   if (ib) {
      assert(ib->offset % ib->index_size == 0);
      pipe_resource_reference(&mgr->index_buffer.buffer, ib->buffer);
      memcpy(&mgr->index_buffer, ib, sizeof(*ib));
   } else {
      pipe_resource_reference(&mgr->index_buffer.buffer, NULL);
   }

   pipe->set_index_buffer(pipe, ib);
}

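/* Upload the ranges of user vertex buffers which the bound vertex elements
 * actually read. The ranges of elements sharing a buffer are merged first,
 * so each buffer is uploaded at most once per draw. */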
static void
u_vbuf_upload_buffers(struct u_vbuf *mgr,
                      int start_vertex, unsigned num_vertices,
                      int start_instance, unsigned num_instances)
{
   unsigned i;
   unsigned nr_velems = mgr->ve->count;
   unsigned nr_vbufs = mgr->nr_vertex_buffers;
   struct pipe_vertex_element *velems =
         mgr->using_translate ? mgr->fallback_velems : mgr->ve->ve;
   unsigned start_offset[PIPE_MAX_ATTRIBS];
   unsigned end_offset[PIPE_MAX_ATTRIBS] = {0};

   /* Determine how much data needs to be uploaded. */
   for (i = 0; i < nr_velems; i++) {
      struct pipe_vertex_element *velem = &velems[i];
      unsigned index = velem->vertex_buffer_index;
      struct pipe_vertex_buffer *vb = &mgr->vertex_buffer[index];
      unsigned instance_div, first, size;

      /* Skip the buffers generated by translate. */
      if (index == mgr->fallback_vbs[VB_VERTEX] ||
          index == mgr->fallback_vbs[VB_INSTANCE] ||
          index == mgr->fallback_vbs[VB_CONST]) {
         continue;
      }

      if (!vb->user_buffer) {
         continue;
      }

      instance_div = velem->instance_divisor;
      first = vb->buffer_offset + velem->src_offset;

      if (!vb->stride) {
         /* Constant attrib. */
         size = mgr->ve->src_format_size[i];
      } else if (instance_div) {
         /* Per-instance attrib. */
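         /* Divide rounding up: a trailing, partially-filled group of
          * instances still fetches one attrib value. */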
         unsigned count = (num_instances + instance_div - 1) / instance_div;
         first += vb->stride * start_instance;
         size = vb->stride * (count - 1) + mgr->ve->src_format_size[i];
      } else {
         /* Per-vertex attrib. */
         first += vb->stride * start_vertex;
         size = vb->stride * (num_vertices - 1) + mgr->ve->src_format_size[i];
      }

      /* Update offsets. */
      if (!end_offset[index]) {
         start_offset[index] = first;
         end_offset[index] = first + size;
      } else {
         if (first < start_offset[index])
            start_offset[index] = first;
         if (first + size > end_offset[index])
            end_offset[index] = first + size;
      }
   }

   /* Upload buffers. */
   for (i = 0; i < nr_vbufs; i++) {
      unsigned start, end = end_offset[i];
      struct pipe_vertex_buffer *real_vb;
      const uint8_t *ptr;

      if (!end) {
         continue;
      }

      start = start_offset[i];
      assert(start < end);

      real_vb = &mgr->real_vertex_buffer[i];
      ptr = mgr->vertex_buffer[i].user_buffer;

      u_upload_data(mgr->uploader, start, end - start, ptr + start,
                    &real_vb->buffer_offset, &real_vb->buffer);

      real_vb->buffer_offset -= start;
   }
}

static boolean u_vbuf_need_minmax_index(struct u_vbuf *mgr)
{
   /* See if there are any per-vertex attribs which will be uploaded or
    * translated. Use bitmasks to get the info instead of looping over vertex
    * elements. */
   return ((mgr->user_vb_mask | mgr->incompatible_vb_mask |
            mgr->ve->incompatible_vb_mask_any) &
           mgr->ve->noninstance_vb_mask_any & mgr->nonzero_stride_vb_mask) != 0;
}

static boolean u_vbuf_mapping_vertex_buffer_blocks(struct u_vbuf *mgr)
{
   /* Return true if there are hw buffers which don't need to be translated.
    *
    * We could query whether each buffer is busy, but that would
    * be way more costly than this. */
   return (~mgr->user_vb_mask & ~mgr->incompatible_vb_mask &
           mgr->ve->compatible_vb_mask_all & mgr->ve->noninstance_vb_mask_any &
           mgr->nonzero_stride_vb_mask) != 0;
}

static void u_vbuf_get_minmax_index(struct pipe_context *pipe,
                                    struct pipe_index_buffer *ib,
                                    const struct pipe_draw_info *info,
                                    int *out_min_index,
                                    int *out_max_index)
{
   struct pipe_transfer *transfer = NULL;
   const void *indices;
   unsigned i;
   unsigned restart_index = info->restart_index;

   if (ib->user_buffer) {
      indices = (uint8_t*)ib->user_buffer +
                ib->offset + info->start * ib->index_size;
   } else {
      indices = pipe_buffer_map_range(pipe, ib->buffer,
                                      ib->offset + info->start * ib->index_size,
                                      info->count * ib->index_size,
                                      PIPE_TRANSFER_READ, &transfer);
   }

   switch (ib->index_size) {
   case 4: {
      const unsigned *ui_indices = (const unsigned*)indices;
      unsigned max_ui = 0;
      unsigned min_ui = ~0U;
      if (info->primitive_restart) {
         for (i = 0; i < info->count; i++) {
            if (ui_indices[i] != restart_index) {
               if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
               if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
            }
         }
      }
      else {
         for (i = 0; i < info->count; i++) {
            if (ui_indices[i] > max_ui) max_ui = ui_indices[i];
            if (ui_indices[i] < min_ui) min_ui = ui_indices[i];
         }
      }
      *out_min_index = min_ui;
      *out_max_index = max_ui;
      break;
   }
   case 2: {
      const unsigned short *us_indices = (const unsigned short*)indices;
      unsigned max_us = 0;
      unsigned min_us = ~0U;
      if (info->primitive_restart) {
         for (i = 0; i < info->count; i++) {
            if (us_indices[i] != restart_index) {
               if (us_indices[i] > max_us) max_us = us_indices[i];
               if (us_indices[i] < min_us) min_us = us_indices[i];
            }
         }
      }
      else {
         for (i = 0; i < info->count; i++) {
            if (us_indices[i] > max_us) max_us = us_indices[i];
            if (us_indices[i] < min_us) min_us = us_indices[i];
         }
      }
      *out_min_index = min_us;
      *out_max_index = max_us;
      break;
   }
   case 1: {
      const unsigned char *ub_indices = (const unsigned char*)indices;
      unsigned max_ub = 0;
      unsigned min_ub = ~0U;
      if (info->primitive_restart) {
         for (i = 0; i < info->count; i++) {
            if (ub_indices[i] != restart_index) {
               if (ub_indices[i] > max_ub) max_ub = ub_indices[i];
               if (ub_indices[i] < min_ub) min_ub = ub_indices[i];
            }
         }
      }
      else {
         for (i = 0; i < info->count; i++) {
            if (ub_indices[i] > max_ub) max_ub = ub_indices[i];
            if (ub_indices[i] < min_ub) min_ub = ub_indices[i];
         }
      }
      *out_min_index = min_ub;
      *out_max_index = max_ub;
      break;
   }
   default:
      assert(0);
      *out_min_index = 0;
      *out_max_index = 0;
   }

   if (transfer) {
      pipe_buffer_unmap(pipe, transfer);
   }
}

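/* The draw entry point: take the fast path if all buffers and vertex
 * elements are driver-compatible; otherwise compute the index range if
 * needed, run the translate fallback and/or upload user buffers, and then
 * issue the draw with the substituted vertex buffers. */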
void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info)
{
   struct pipe_context *pipe = mgr->pipe;
   int start_vertex, min_index;
   unsigned num_vertices;
   boolean unroll_indices = FALSE;
   uint32_t user_vb_mask = mgr->user_vb_mask;

   /* Normal draw. No fallback and no user buffers. */
   if (!mgr->incompatible_vb_mask &&
       !mgr->ve->incompatible_elem_mask &&
       !user_vb_mask) {
      /* Set vertex buffers if needed. */
      if (mgr->vertex_buffers_dirty) {
         pipe->set_vertex_buffers(pipe, mgr->nr_real_vertex_buffers,
                                  mgr->real_vertex_buffer);
         mgr->vertex_buffers_dirty = FALSE;
      }

      pipe->draw_vbo(pipe, info);
      return;
   }

   if (info->indexed) {
      /* See if anything needs to be done for per-vertex attribs. */
      if (u_vbuf_need_minmax_index(mgr)) {
         int max_index;

         if (info->max_index != ~0) {
            min_index = info->min_index;
            max_index = info->max_index;
         } else {
            u_vbuf_get_minmax_index(mgr->pipe, &mgr->index_buffer, info,
                                    &min_index, &max_index);
         }

         assert(min_index <= max_index);

         start_vertex = min_index + info->index_bias;
         num_vertices = max_index + 1 - min_index;

         /* Primitive restart doesn't work when unrolling indices.
          * We would have to break this drawing operation into several ones. */
         /* Use some heuristic to see if unrolling indices improves
          * performance. */
         if (!info->primitive_restart &&
             num_vertices > info->count*2 &&
             num_vertices-info->count > 32 &&
             !u_vbuf_mapping_vertex_buffer_blocks(mgr)) {
            /*printf("num_vertices=%i count=%i\n", num_vertices, info->count);*/
            unroll_indices = TRUE;
            user_vb_mask &= ~(mgr->nonzero_stride_vb_mask &
                              mgr->ve->noninstance_vb_mask_any);
         }
      } else {
         /* Nothing to do for per-vertex attribs. */
         start_vertex = 0;
         num_vertices = 0;
         min_index = 0;
      }
   } else {
      start_vertex = info->start;
      num_vertices = info->count;
      min_index = 0;
   }

   /* Translate vertices with non-native layouts or formats. */
   if (unroll_indices ||
       mgr->incompatible_vb_mask ||
       mgr->ve->incompatible_elem_mask) {
      /* XXX check the return value */
      u_vbuf_translate_begin(mgr, start_vertex, num_vertices,
                             info->start_instance, info->instance_count,
                             info->start, info->count, min_index,
                             unroll_indices);

      user_vb_mask &= ~(mgr->incompatible_vb_mask |
                        mgr->ve->incompatible_vb_mask_all);
   }

   /* Upload user buffers. */
   if (user_vb_mask) {
      u_vbuf_upload_buffers(mgr, start_vertex, num_vertices,
                            info->start_instance, info->instance_count);
   }

   /*
   if (unroll_indices) {
      printf("unrolling indices: start_vertex = %i, num_vertices = %i\n",
             start_vertex, num_vertices);
      util_dump_draw_info(stdout, info);
      printf("\n");
   }

   unsigned i;
   for (i = 0; i < mgr->nr_vertex_buffers; i++) {
      printf("input %i: ", i);
      util_dump_vertex_buffer(stdout, mgr->vertex_buffer+i);
      printf("\n");
   }
   for (i = 0; i < mgr->nr_real_vertex_buffers; i++) {
      printf("real %i: ", i);
      util_dump_vertex_buffer(stdout, mgr->real_vertex_buffer+i);
      printf("\n");
   }
   */

   u_upload_unmap(mgr->uploader);
   pipe->set_vertex_buffers(pipe, mgr->nr_real_vertex_buffers,
                            mgr->real_vertex_buffer);

   if (unlikely(unroll_indices)) {
      struct pipe_draw_info new_info = *info;
      new_info.indexed = FALSE;
      new_info.index_bias = 0;
      new_info.min_index = 0;
      new_info.max_index = info->count - 1;
      new_info.start = 0;

      pipe->draw_vbo(pipe, &new_info);
   } else {
      pipe->draw_vbo(pipe, info);
   }

   if (mgr->using_translate) {
      u_vbuf_translate_end(mgr);
   }
   mgr->vertex_buffers_dirty = TRUE;
}

void u_vbuf_save_vertex_elements(struct u_vbuf *mgr)
{
   assert(!mgr->ve_saved);
   mgr->ve_saved = mgr->ve;
}

void u_vbuf_restore_vertex_elements(struct u_vbuf *mgr)
{
   if (mgr->ve != mgr->ve_saved) {
      struct pipe_context *pipe = mgr->pipe;

      mgr->ve = mgr->ve_saved;
      pipe->bind_vertex_elements_state(pipe,
                                       mgr->ve ? mgr->ve->driver_cso : NULL);
   }
   mgr->ve_saved = NULL;
}

void u_vbuf_save_vertex_buffers(struct u_vbuf *mgr)
{
   util_copy_vertex_buffers(mgr->vertex_buffer_saved,
                            &mgr->nr_vertex_buffers_saved,
                            mgr->vertex_buffer,
                            mgr->nr_vertex_buffers);
}

void u_vbuf_restore_vertex_buffers(struct u_vbuf *mgr)
{
   unsigned i;

   u_vbuf_set_vertex_buffers(mgr, mgr->nr_vertex_buffers_saved,
                             mgr->vertex_buffer_saved);
   for (i = 0; i < mgr->nr_vertex_buffers_saved; i++) {
      pipe_resource_reference(&mgr->vertex_buffer_saved[i].buffer, NULL);
   }
   mgr->nr_vertex_buffers_saved = 0;
}