Repository: nvpro-samples/gl_cadscene_rendertechniques
Branch: master
Commit: bd7e727c8b03
Files: 47
Total size: 347.6 KB

Directory structure:
gitextract_wdi1bw94/

├── .gitignore
├── CMakeLists.txt
├── CONTRIBUTING
├── LICENSE
├── README.md
├── cadscene.cpp
├── cadscene.hpp
├── common.h
├── csf.cpp
├── csfviewer.cpp
├── cull-bitpack.vert.glsl
├── cull-downsample.frag.glsl
├── cull-downsample.vert.glsl
├── cull-raster.frag.glsl
├── cull-raster.geo.glsl
├── cull-raster.vert.glsl
├── cull-tokencmds.vert.glsl
├── cull-tokensizes.vert.glsl
├── cull-xfb.vert.glsl
├── cullingsystem.cpp
├── cullingsystem.hpp
├── nodetree.cpp
├── nodetree.hpp
├── nvtoken.cpp
├── nvtoken.hpp
├── renderer.cpp
├── renderer.hpp
├── rendererindexedmdi.cpp
├── renderertoken.cpp
├── renderertokensortcull.cpp
├── renderertokenstream.cpp
├── rendereruborange.cpp
├── rendererubosub.cpp
├── scan.comp.glsl
├── scansystem.cpp
├── scansystem.hpp
├── scene.frag.glsl
├── scene.vert.glsl
├── statesystem.cpp
├── statesystem.hpp
├── tokenbase.cpp
├── tokenbase.hpp
├── transform-leaves.comp.glsl
├── transform-level.comp.glsl
├── transformsystem.cpp
├── transformsystem.hpp
└── xplode-animation.comp.glsl

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
.clang-format
.editorconfig

#############################
#Spirv
#############################
*.spv
*.spva
*.sass
*.sassbin
*.bat

#############################
#specific to the project
#############################
cmake_built
cmake_build
build
_install
bin_x64
NVPRO_EXTERNAL
nvpro_core

================================================
FILE: CMakeLists.txt
================================================
# Declare the minimum CMake version first so policy defaults are set, then
# name the project after the folder that contains this CMakeLists.txt.
cmake_minimum_required(VERSION 3.5)
get_filename_component(PROJNAME ${CMAKE_CURRENT_SOURCE_DIR} NAME)
project(${PROJNAME})

# Banner that makes this sample easy to spot in the configure log.
message(STATUS "-------------------------------")
message(STATUS "Processing Project ${PROJNAME}:")

#####################################################################################
# look for nvpro_core 1) as a sub-folder 2) at some other locations
# this cannot be put anywhere else since we still didn't find setup.cmake yet
#
# Search for the directory containing nvpro_core unless BASE_DIRECTORY was
# already provided (for example via -DBASE_DIRECTORY=<path>). The candidates
# are this folder, its parent and its grandparent.
#
# find_path() is intentionally called without REQUIRED: that keyword only
# exists since CMake 3.18, while this file declares 3.5 as its minimum, and it
# would abort with a generic error before the more helpful message below could
# be printed. The EXISTS check that follows already makes a failed lookup fatal.
if(NOT BASE_DIRECTORY)
  find_path(BASE_DIRECTORY
    NAMES nvpro_core/cmake/setup.cmake
    PATHS
      "${CMAKE_CURRENT_SOURCE_DIR}"
      "${CMAKE_CURRENT_SOURCE_DIR}/.."
      "${CMAKE_CURRENT_SOURCE_DIR}/../.."
    DOC "Directory containing nvpro_core"
  )
endif()

# Pull in the shared nvpro_core build helpers, or stop and tell the user how
# to fix the lookup. Paths are quoted so unusual characters in the location
# cannot split them into several arguments.
if(EXISTS "${BASE_DIRECTORY}/nvpro_core/cmake/setup.cmake")
  include("${BASE_DIRECTORY}/nvpro_core/cmake/setup.cmake")
  include("${BASE_DIRECTORY}/nvpro_core/cmake/utilities.cmake")
else()
  message(FATAL_ERROR "could not find base directory, please set BASE_DIRECTORY to folder containing nvpro_core")
endif()

# Per-project setup helper. It is not defined in this file; presumably it comes
# from the nvpro_core setup.cmake / utilities.cmake includes above (confirm there).
_add_project_definitions(${PROJNAME})

#--------------------------------------------------------------------------------------------------
# Resources
#
# Requests the sample resource geforce.csf.gz. download_files is not defined in
# this file -- presumably an nvpro_core helper that fetches the file when it is
# missing; confirm its exact behavior in nvpro_core.
download_files(FILENAMES geforce.csf.gz)

#####################################################################################
# additions from packages needed for this sample
# add refs  in LIBRARIES_OPTIMIZED
# add refs  in LIBRARIES_DEBUG
# add files in PACKAGE_SOURCE_FILES
#
# NOTE: the three variables named above are consumed further down in this file
# (add_executable and the debug/optimized link loops).
_add_package_OpenGL()
_add_package_ImGUI()
_add_package_ZLIB()

# Directory-scoped compile definition CSF_SUPPORT_ZLIB=1. Presumably enables
# reading of zlib-compressed .csf files (the resource above is a .csf.gz) --
# confirm against csf.cpp.
add_definitions(-DCSF_SUPPORT_ZLIB=1)

#####################################################################################
# process the rest of some cmake code that needs to be done *after* the packages add
# (keep this call below the _add_package_* calls; the ordering is intentional)
_add_nvpro_core_lib()

#####################################################################################
# Source files for this project
#
# C/C++ sources and headers that live next to this CMakeLists.txt.
file(GLOB SOURCE_FILES
  *.cpp
  *.hpp
  *.inl
  *.h
  *.c
)
# Shader files are collected separately: they get their own IDE folder below
# and are reused for the install step at the end of this file.
file(GLOB GLSL_FILES
  *.glsl
)


#####################################################################################
# Executable
#
# On Windows, define _CRT_SECURE_NO_WARNINGS for the sources compiled here.
if(WIN32)
  add_definitions(-D_CRT_SECURE_NO_WARNINGS)
endif()

# The target is built from the sample's own sources plus the common and package
# source lists; the shaders are attached so they show up in the IDE project.
add_executable(${PROJNAME}
  ${SOURCE_FILES}
  ${COMMON_SOURCE_FILES}
  ${PACKAGE_SOURCE_FILES}
  ${GLSL_FILES}
)

#####################################################################################
# IDE folders: shared/package sources under "common", shaders under "shaders"
#
source_group(common FILES ${COMMON_SOURCE_FILES} ${PACKAGE_SOURCE_FILES})
source_group(shaders FILES ${GLSL_FILES})

#####################################################################################
# Linkage
#
# Libraries that are linked in every configuration.
target_link_libraries(${PROJNAME}
  ${PLATFORM_LIBRARIES}
  nvpro_core
)

# Entries of LIBRARIES_DEBUG are linked for debug configurations only ...
foreach(debug_lib ${LIBRARIES_DEBUG})
  target_link_libraries(${PROJNAME} debug ${debug_lib})
endforeach()

# ... and entries of LIBRARIES_OPTIMIZED for all other configurations.
foreach(optimized_lib ${LIBRARIES_OPTIMIZED})
  target_link_libraries(${PROJNAME} optimized ${optimized_lib})
endforeach()

#####################################################################################
# copies binaries that need to be put next to the exe files (ZLib, etc.)
#
_finalize_target(${PROJNAME})

# Install the shaders, together with common.h, into a per-project GLSL folder.
# Release and Debug each get their own destination directory.
list(APPEND GLSL_FILES "common.h")
install(
  FILES ${GLSL_FILES}
  CONFIGURATIONS Release
  DESTINATION "bin_${ARCH}/GLSL_${PROJNAME}"
)
install(
  FILES ${GLSL_FILES}
  CONFIGURATIONS Debug
  DESTINATION "bin_${ARCH}_debug/GLSL_${PROJNAME}"
)


================================================
FILE: CONTRIBUTING
================================================
https://developercertificate.org/

Developer Certificate of Origin
Version 1.1

Copyright (C) 2004, 2006 The Linux Foundation and its contributors.

Everyone is permitted to copy and distribute verbatim copies of this
license document, but changing it is not allowed.


Developer's Certificate of Origin 1.1

By making a contribution to this project, I certify that:

(a) The contribution was created in whole or in part by me and I
    have the right to submit it under the open source license
    indicated in the file; or

(b) The contribution is based upon previous work that, to the best
    of my knowledge, is covered under an appropriate open source
    license and I have the right under that license to submit that
    work with modifications, whether created in whole or in part
    by me, under the same open source license (unless I am
    permitted to submit under a different license), as indicated
    in the file; or

(c) The contribution was provided directly to me by some other
    person who certified (a), (b) or (c) and I have not modified
    it.

(d) I understand and agree that this project and the contribution
    are public and that a record of the contribution (including all
    personal information I submit with it, including my sign-off) is
    maintained indefinitely and may be redistributed consistent with
    this project or the open source license(s) involved.

================================================
FILE: LICENSE
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

================================================
FILE: README.md
================================================
# gl cadscene render techniques

This sample implements several scene rendering techniques that target mostly static data, such as often found in CAD or DCC applications. In this context, 'static' means that the vertex and index buffers for the scene's objects rarely change. This can include editing the geometry of a few scene objects, but the matrix and material values are the properties that are modified the most across frames. Imagine making edits to the wheel topology of a car, or positioning an engine; the rest of the assembly remains the same.

The principal OpenGL mechanisms that are used here are described in the [SIGGRAPH 2014 presentation slides](http://on-demand.gputechconf.com/siggraph/2014/presentation/SG4117-OpenGL-Scene-Rendering-Techniques.pdf). It is highly recommended to go through the slides first.

The sample makes use of multiple OpenGL 4 core features, such as **ARB_multi_draw_indirect**, but also showcases OpenGL 3 style rendering techniques.

There are also several techniques built around the **NV_command_list** extension. Please refer to [gl commandlist basic](https://github.com/nvpro-samples/gl_commandlist_basic) for an introduction to NV_command_list.

> Note: This is just a sample to illustrate several techniques and possibilities for how to approach rendering. Its purpose is not to provide production-level, highly optimized implementations.

### Scene Setup

The sample loads a cadscene file (csf). This file format is inspired by CAD applications' data organization, but (for simplicity) everything is stored in a single RAW file.

The scene is organized into:

 * Matrices: object transforms as well as concatenated world matrices 
 * TreeNodes: a tree containing hierarchical information, mapping to Matrix indices

 * Materials: just classic two-sided OpenGL Blinn-Phong material parameters
 * Geometries: storing vertex and index information, organized into
  * GeometryParts, which reference a sub-range within index buffer, for either "wireframe" or "solid" surfaces

 * Objects, that reference Geometry and have corresponding
  * ObjectParts, that encode part-level Material and Matrix assignment. Typically, an object uses just one Matrix for all its parts.

### Shademodes

![sample screenshot](https://github.com/nvpro-samples/gl_cadscene_rendertechniques/blob/master/doc/sample.jpg)

- **solid**: only triangles are drawn
- **solid with edges**: triangles and edge outlines on top (using PolygonOffset to push triangles back). When no global sorting (see later) is performed, this means we toggle between the two modes for every object.
- **solid with edges (split test, only in sorted)**: an artificial mode that also separates triangles and edges into different FBOs, and is available in "sorted" and "token" renderers. The implementation has no real use-case character and is more or less for internal benchmarking of FBO toggles.

### Strategies

These influence the number of drawcalls we generate for the hardware and software. Using OpenGL's MultiDraw* functions we can have fewer software calls than hardware drawcalls, which helps trigger faster paths in the driver as there is less validation overhead. A strategy is applied on a per-object level.

Imagine an object whose parts use two materials, red and blue:

```
material: r b b r
parts:    A B C D
```

- **materialgroups**
Here we create a per-object cache of drawcall ranges for MultiDraw* based on the object's material and matrix assignments. We also "grow" drawcalls if subsequent ranges in the index buffer have the same assignments. Our sample object would be drawn using 2 states with one glMultiDrawElements call each, which together create 3 hardware drawcalls: red covers the ranges A and D, and blue is B+C joined together, as they are next to each other in the index buffer.
- **drawcall join**
As we traverse, we combine drawcalls that share the same state. This means 3 drawcalls for hardware and 3 for software, as well as 3 states: red A, blue B+C, red D.
- **drawcall individual**
We render each piece individually:
red A, blue B, C, red D.

Typically we do all rendering with basic state redundancy filtering so we don't set up a matrix/material change if the same is still active. To keep things simple for state redundancy filtering, you should not go too fine-grained, otherwise all the tracking causes too much memory hopping. In our case we have 3 indices we track: geometry (handles vertex / index buffer setup), material, and matrix.

### Renderers
Most renderers will traverse the scene data every frame. The organization of the data is cache-friendly foremost, everything is stored in arrays, without too much memory hopping. Some renderers may implement additional caching for rendering.

#### Variants:

 - **bindless**: these variants make use of NVIDIA's bindless extensions NV_vertex_buffer_unified_memory and NV_uniform_buffer_unified_memory, which allows a lower-overhead path in the driver for faster drawcall submission. Classic glBindVertexBuffer or glBindBufferRange are replaced with glBufferAddressRangeNV.
 - **sorted**: indicates we do a global scene sort once, to minimize state changes in subsequent frames.
 - **cullsorted**: next to global sorting by state, we also apply occlusion culling as presented in [end of the slides](http://on-demand.gputechconf.com/siggraph/2014/presentation/SG4117-OpenGL-Scene-Rendering-Techniques.pdf) or in the [gl occlusion culling](https://github.com/nvpro-samples/gl_occlusion_culling) sample.
 - **emulated**: several of the NV_command_list techniques can be run in emulated mode.

#### Techniques:

We are mostly looking into accelerating our matrix and material parameter switching performance.

- **uborange**
All matrices and materials are stored in big buffer objects, which allows us to efficiently bind the required sub-range for a drawcall via glBindBufferRange(GL_UNIFORM_BUFFER, usageSlot, buffer, index * itemSize, itemSize). NVIDIA provides optimized paths if you keep the buffer and itemSize for a usageSlot constant for many glBindBufferRange calls. Be aware of GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, which is 256 bytes for most current NVIDIA hardware (Fermi, Kepler, Maxwell).

- **ubosub**
Not as efficient as the above, but maybe appropriate if you cannot afford to cache parameter data. We make use of one streaming buffer per usage slot and continuously update it via glBufferSubData. NVIDIA's drivers do particularly well if you never bind this buffer as anything but a GL_UNIFORM_BUFFER and keep size and offsets a multiple of 4.

- **indexedmdi**
Similar to uborange, we store all data in bigger buffers in advance. This doesn't make the data "static"; you can always update the portions you need, but there is a high chance a lot of data is the same frame to frame. This time, we do not bind memory ranges through the OpenGL API, but let the shader do an indirection and only pass the required matrix and material indices. 
For the matrix data we use GL_TEXTURE_BUFFER as it's particularly performant for high frequency / potentially divergent access. We typically have far more matrices than materials in our scene. For material data, it's a bit "ugly" to use lots of texelFetch instructions decoding all our parameters; it's much easier to write them as structs and store the array either as GL_UNIFORM_BUFFER or GL_SHADER_STORAGE_BUFFER. The latter is only recommended if you have divergent shader access or exceed the 64 KB limit of UBOs.
To pass the indices per-drawcall we make use of GL_ARB_multi_draw_indirect and "instanced" vertex attributes as described at [GTC 2013 on slide 27](http://on-demand.gputechconf.com/gtc/2013/presentations/S3032-Advanced-Scenegraph-Rendering-Pipeline.pdf).
Therefore this renderer requires two additional buffers: one encoding our object's matrix and material index assignments, and one encoding the scene's drawcalls as GL_DRAW_INDIRECT_BUFFER. 

A hybrid approach, where the parameter index like "indexedmdi" is used for matrices and uborange bind is used for materials, is not yet implemented, but would be a good compromise.

The following renderers make use of the **NV_command_list** extension. In principle they **behave as "uborange"**, however all buffer bindings and drawcalls are encoded into binary tokens that are submitted in bulk. In preparation for drawing, the appropriate stateobjects are created and reused when rendering (one for lines and one for triangles). While stateobject capturing is not extremely expensive, it is still best to cache it across frames.

- **tokenbuffer**
Similar to indexedmdi we create a buffer that describes our scene by storing all the relevant token commands. This buffer is filled only once and then later reused.
- **tokenlist**
Instead of storing the tokens inside a buffer we make use of the commandlist object, and create and compile one for each shademode for later reuse. Every time our state changes (for instance, when resizing FBOs), we have to recreate these lists, which makes this approach less flexible than the token buffer but faster when there are lots of state changes within the list.
- **tokenstream**
This approach does not reuse the tokens across frames, but instead dynamically creates the tokenstream every frame. By default, the demo fills and submits tokens in chunks of 256 KB; better values may exist depending on the scene.

### Performance

All timings are preliminary results for *Timer Draw* on a win7-64, i7-860, Quadro K5000 system. 

**Important Note About Timer Query Results:** The GPU time reported below is measured via timer queries, those values however can be skewed by CPU bottlenecks. The "begin" timestamp may be part of a different command submission to the GPU than the "end" timestamp. That means a long delay on the CPU side between those submissions will also increase the reported GPU time. That is why in CPU-bottlenecked scenarios with tons of OpenGL commands, the GPU times below are close to the CPU time.

```
scene statistics:
geometries:    110
materials:      66
nodes:        5004
objects:      2497

tokenbuffer/glstream complexities:
type: solid              materialgroups | drawcall individual
commandsize:                     347292 | 1301692
statetoggles:                         1 | 1
tokens:                 
GL_DRAW_ELEMENTS_COMMAND_NV:      11103 |   68452
GL_ELEMENT_ADDRESS_COMMAND_NV:      807 |     807
GL_ATTRIBUTE_ADDRESS_COMMAND_NV:    807 |     807
GL_UNIFORM_ADDRESS_COMMAND_NV:     8988 |   11289
GL_POLYGON_OFFSET_COMMAND_NV:         1 |       1

type: solid w edges
commandsize:                     629644 | 2534412
statetoggles:                      4994 |    4994
tokens:
GL_DRAW_ELEMENTS_COMMAND_NV:      22281 |  136750
GL_ELEMENT_ADDRESS_COMMAND_NV:      807 |     807
GL_ATTRIBUTE_ADDRESS_COMMAND_NV:    807 |     807
GL_UNIFORM_ADDRESS_COMMAND_NV:    15457 |   20036
GL_POLYGON_OFFSET_COMMAND_NV:         1 |       1
```

As one can see from the statistics the key difference is the number of drawcalls for the hardware:
- **materialgroups**: ~ 10 000 drawcalls (inner two columns)
- **drawcall individual**: ~ 70 000 drawcalls (rightmost two columns)

*shademode: solid*

renderer | GPU time | CPU time | GPU time | CPU time (microseconds)
------------ | ------------- | ------------- | ------------- | -------------
**strategy** | **material-** | **-groups** | **drawcall-** | **-individual**
ubosub | 1550 | 1870 |  6000 | 7420
uborange | 1010| 1890 | 3720 | 7660
uborange_bindless | 1010 | 1200 | 2560 | 4900
indexedmdi | 1120 | 1200 | 2080 | 1100
tokenstream | 860 | 300 | 1520 | 1400
tokenbuffer | 780 | <10 | 1230 | <10
tokenlist | 780 | <10 | 880 | <10
tokenbuffer_cullsorted | 540 | 120 | 760 | 120

The results are of course very scene dependent; this model was specifically chosen as it is made of many parts with very few triangles. If the complexity per drawcall were higher (say more triangles or complex shading), then the CPU impact would be lower and we would be GPU-bound. However the CPU time recovered by faster submission mechanisms can always be used elsewhere. So even if we are GPU-bound, time should not be wasted.

We can see that the "token" techniques do very well and are never CPU-bound, and the "indexedmdi" technique is also quite good. This technique is especially useful for very high-frequency parameters, for example when rendering "id-buffers" for selection, but also for matrix indices. For general use-cases, working with uborange binds is recommended. 

*shademode: solid with edges*

Unless "sorted", around 5000 toggles are done between triangles/line rendering. The shader
is manipulated through an immediate vertex attribute to toggle between lit/unlit rendering respectively.

renderer | GPU time | CPU time | GPU time | CPU time (microseconds)
------------ | ------------- | ------------- | ------------- | -------------
**strategy** | **material-** | **-groups** | **drawcall-** | **-individual**
ubosub | 2890 | 3350 | 13000 | 15000
uborange | 2150 | 3700 | 12500 | 15200
uborange_bindless | 2150 | 2640 | 8300 | 10000
indexedmdi | 2340 | 2200 | 4050 | 2050
tokenstream | 1860 | 1250 | 3360 | 3200
tokenbuffer | 1750 | 450 | 2650 | 350
tokenlist | 1650 | <10 | 1890 | <10
tokenbuffer_cullsorted | 770 | 120 | 1250 | 120

Compared to the "solid" results, the tokenbuffer and tokenlist techniques show a greater difference in CPU time.


### Model Explosion View

The simple viewer allows you to add animation to the scene and artificially increase scene complexity via "clones".

![xplodeclones](https://github.com/nvpro-samples/gl_cadscene_rendertechniques/blob/master/doc/xplodeclones.jpg)

To "emulate" typical interaction where users might move objects around or have animated scenes, the sample also implements the matrix transform system sketched on [slide 30](http://on-demand.gputechconf.com/siggraph/2014/presentation/SG4117-OpenGL-Scene-Rendering-Techniques.pdf). 

The effect works by first moving all object matrices a bit (*xplode-animation.comp.glsl*), and afterwards the transform hierarchy is updated via a system that is implemented in the *transformsystem.cpp / hpp* files.

The code is not particularly tuned but naively assumes that upper levels of the hierarchy contain fewer nodes than lower levels (pyramid). Therefore it uses leaf-processing (which redundantly calculates matrices) instead of level-wise processing for the first 10 levels, to avoid dependencies (one small compute task waiting for the previous). Later levels are always processed level-wise. A better strategy would be to switch between the two approaches based on the actual number of nodes per level. The shaders for this are *transform-leaves.comp.glsl* and *transform-level.comp.glsl*. 

The hierarchy is managed by *nodetree.cpp/hpp*, which stores the tree as an array of 32-bit values. Each value represents a node, and encodes its "level" in the hierarchy in 8 bits and its parent index in the remaining bits. This means you can traverse from a node up to the root:

``` cpp
// sample traversal of "idx" node to root
self = array[idx];
while( self.level != 0) {
  self = array[self.parent];
}
// self is now the top root for the idx node
```

The nodetree also stores two node index lists for each level: one storing all nodes of a level, and one for all leaves in this level. We feed these two index lists to the appropriate shader. When leaf processing is used we append the leaves level-wise, which should minimize divergence within a warp (ideally most threads have the same number of levels to ascend in the hierarchy).

Many CAD applications tend to use double-precision matrices, and the system could be adjusted for this. For rendering, however, float matrices should be used. To account for large translation values, one could run a concatenation of view-projection (double) and object-world-matrix (double) per-frame and generate the matrices (float) for actual vertex transforms. To improve memory performance, it might be beneficial to use double only for storing translations within the matrices.

> Note: Only the GPU matrices are updated. CPU techniques such as "ubosub" will not show animations.

### Sample Highlights

This sample is a bit more complex than most others as it contains several subsystems. Don't hesitate to contact the author if something is unclear (commenting was not a priority ;) ).

#### csfviewer.cpp
The principal setup of the sample is in this main file. However, most of the interesting bits happen in the renderers.

- Sample::think - prepares the frame and calls the renderer's draw function

#### renderer... and tokenbase...
Each renderer has its own file and is derived from the **Renderer** class in *renderer.hpp*

- Renderer::init - some renderers may allocate extra buffers or create their own data structures for the scene.
- Renderer::deinit 
- Renderer::draw

The renderers may have additional functions. The "token" renderers using NV_command_list or "indexedmdi", for instance, must create their own scene representation.

#### cadscene...
The "csf" (cadscene file) format is a simple binary format that encodes a scene as is typical for CAD. It closely matches the description at the beginning of the readme. It is not very sophisticated, and is meant for demo purposes.

> *Note*: The **geforce.csf.gz** assembly binary file that ships with this sample **may NOT be redistributed.**

#### nodetree... and transform...
Implement the matrix hierarchy updates as described in the "model explosion view" section.

#### cull... and scan...
For files related to culling, it is best to refer to the [gl occlusion culling](https://github.com/nvpro-samples/gl_occlusion_culling) sample, as it leverages the same system and focuses on just that topic.

*renderertokensortcull.cpp* implements *RendererCullSortToken::CullJobToken::resultFromBits*, which contains the details of how the occlusion results are handled in this sample. The implementation uses the "raster" "temporal" approach.

#### statesystem... nvtoken... and nvcommandlist...
These files contain helpers when using the NV_command_list extension. Please see [gl commandlist basic](https://github.com/nvpro-samples/gl_commandlist_basic) for a smaller sample.

### Building
Ideally, clone this and other interesting [nvpro-samples](https://github.com/nvpro-samples) repositories into a common subdirectory. You will always need [nvpro_core](https://github.com/nvpro-samples/nvpro_core). The nvpro_core is searched either as a subdirectory of the sample, or one directory up.

If you are interested in multiple samples, you can use the [build_all](https://github.com/nvpro-samples/build_all) CMake project as the entry point. This will also give you options to enable or disable individual samples when creating the solutions.

### Related Samples
[gl commandlist basic](https://github.com/nvpro-samples/gl_commandlist_basic) illustrates the core principle of the NV_command_list extension.
[gl occlusion culling](https://github.com/nvpro-samples/gl_occlusion_culling) also uses the occlusion system of this sample, but in a simpler usage scenario.

When using classic scenegraphs, there is typically a lot of overhead in traversing the scene. For this reason, it is highly recommended to use simpler representations for actual rendering. Consider using flattened hierarchies, arrays, memory-friendly data structures, data-oriented design patterns, and similar techniques.
If you are still working with a classic scenegraph, then [nvpro-pipeline](https://github.com/nvpro-pipeline/pipeline) may provide some acceleration strategies to avoid full scenegraph traversal. Some of these strategies are also described in this [GTC 2013 presentation](http://on-demand.gputechconf.com/gtc/2013/presentations/S3032-Advanced-Scenegraph-Rendering-Pipeline.pdf).


================================================
FILE: cadscene.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include "cadscene.hpp"
#include <fileformats/cadscenefile.h>

#include <algorithm>
#include <assert.h>
#include <cstddef>
#include "glm/gtc/type_ptr.hpp"

#define USE_CACHECOMBINE 1


// Returns a vector whose four components are each drawn from rand() and
// mapped linearly into the interval [from, to]. Components are generated in
// x, y, z, w order, so the sequence depends on the current rand() state.
glm::vec4 randomVector(float from, float to)
{
  const float span = to - from;

  glm::vec4 result;
  int       component = 0;
  while(component < 4)
  {
    // normalised sample in [0, 1]
    const float unit  = float(rand()) / float(RAND_MAX);
    result[component] = from + unit * span;
    ++component;
  }
  return result;
}

// Registers the parent/child links of the CSF sub-tree rooted at node `idx`
// in `tree`. Every node id handed to the tree is shifted by `cloneoffset`,
// while the CSF arrays themselves are always indexed with the unshifted id.
static void recursiveHierarchy(NodeTree& tree, CSFile* csf, int idx, int cloneoffset)
{
  // Pass 1: attach every direct child of `idx` to `idx`.
  int child = 0;
  while(child < csf->nodes[idx].numChildren)
  {
    tree.setNodeParent((NodeTree::nodeID)csf->nodes[idx].children[child] + cloneoffset, (NodeTree::nodeID)idx + cloneoffset);
    ++child;
  }

  // Pass 2: descend into each child only after all siblings were attached.
  child = 0;
  while(child < csf->nodes[idx].numChildren)
  {
    recursiveHierarchy(tree, csf, csf->nodes[idx].children[child], cloneoffset);
    ++child;
  }
}

// Loads a CSF scene from `filename`, fills the CPU-side arrays of this CadScene
// (materials, geometry, bounding boxes, matrices, objects, node tree) and
// creates the matching OpenGL buffers/textures.
//
// clones    - number of additional copies of the scene to create
//             (copies = clones + 1). Clones reuse the original geometry
//             buffers but get their own matrices and objects.
// cloneaxis - bit mask selecting the axes along which clones are laid out
//             (bit 0 = x, bit 1 = y, bit 2 = z).
//
// Returns false if the file could not be loaded or if it does not carry the
// CADSCENEFILE_FLAG_UNIQUENODES flag; true otherwise.
//
// NOTE(review): several uploads take `&vec[0]` of a std::vector; this assumes
// the file contains at least one material, geometry, vertex, index and node.
// NOTE(review): m_bbox is only merged into, never reset, in this function.
bool CadScene::loadCSF(const char* filename, int clones, int cloneaxis)
{
  CSFile*         csf;
  CSFileMemoryPTR mem = CSFileMemory_new();
  if(CSFile_loadExt(&csf, filename, mem) != CADSCENEFILE_NOERROR || !(csf->fileFlags & CADSCENEFILE_FLAG_UNIQUENODES))
  {
    CSFileMemory_delete(mem);
    return false;
  }

  int copies = clones + 1;

  CSFile_transform(csf);

  // fixed seed: the randomized material colors below are reproducible
  srand(234525);

  // materials
  m_materials.resize(csf->numMaterials);
  for(int n = 0; n < csf->numMaterials; n++)
  {
    CSFMaterial* csfmaterial = &csf->materials[n];
    Material&    material    = m_materials[n];

    // both sides get independently randomized colors; only the diffuse term
    // is based on the color stored in the file
    for(int i = 0; i < 2; i++)
    {
      material.sides[i].ambient  = randomVector(0.0f, 0.1f);
      material.sides[i].diffuse  = glm::make_vec4(csf->materials[n].color) + randomVector(0.0f, 0.07f);
      material.sides[i].specular = randomVector(0.25f, 0.55f);
      material.sides[i].emissive = randomVector(0.0f, 0.05f);
    }
  }

  glCreateBuffers(1, &m_materialsGL);
  glNamedBufferStorage(m_materialsGL, sizeof(Material) * m_materials.size(), &m_materials[0], 0);
  //glMapNamedBufferRange(m_materialsGL, 0, sizeof(Material) * m_materials.size(), GL_MAP_PERSISTENT_BIT | GL_MAP_WRITE_BIT);

  // geometry
  // The arrays are sized for all copies; only the first numGeoms entries are
  // filled here, the clone entries are filled by the loop that follows.
  int numGeoms = csf->numGeometries;
  m_geometry.resize(csf->numGeometries * copies);
  m_geometryBboxes.resize(csf->numGeometries * copies);
  for(int n = 0; n < csf->numGeometries; n++)
  {
    CSFGeometry* csfgeom = &csf->geometries[n];
    Geometry&    geom    = m_geometry[n];

    // -1 marks an original (buffer-owning) geometry, see unload()
    geom.cloneIdx = -1;

    geom.numVertices   = csfgeom->numVertices;
    geom.numIndexSolid = csfgeom->numIndexSolid;
    geom.numIndexWire  = csfgeom->numIndexWire;

    // expand the file's 3-component positions/normals to the vec4 Vertex layout
    std::vector<Vertex> vertices(csfgeom->numVertices);
    for(int i = 0; i < csfgeom->numVertices; i++)
    {
      vertices[i].position[0] = csfgeom->vertex[3 * i + 0];
      vertices[i].position[1] = csfgeom->vertex[3 * i + 1];
      vertices[i].position[2] = csfgeom->vertex[3 * i + 2];
      vertices[i].position[3] = 1.0f;
      if(csfgeom->normal)
      {
        vertices[i].normal[0] = csfgeom->normal[3 * i + 0];
        vertices[i].normal[1] = csfgeom->normal[3 * i + 1];
        vertices[i].normal[2] = csfgeom->normal[3 * i + 2];
        vertices[i].normal[3] = 0.0f;
      }
      else
      {
        // no normals in the file: fall back to the normalized position
        vertices[i].normal = glm::vec4(normalize(glm::vec3(vertices[i].position)), 0.0f);
      }


      m_geometryBboxes[n].merge(vertices[i].position);
    }

    geom.vboSize = sizeof(Vertex) * vertices.size();

    glCreateBuffers(1, &geom.vboGL);
    glNamedBufferStorage(geom.vboGL, geom.vboSize, &vertices[0], 0);

    // one index buffer per geometry: all solid indices first, then all wire indices
    std::vector<GLuint> indices(csfgeom->numIndexSolid + csfgeom->numIndexWire);
    memcpy(&indices[0], csfgeom->indexSolid, sizeof(GLuint) * csfgeom->numIndexSolid);
    if(csfgeom->indexWire)
    {
      memcpy(&indices[csfgeom->numIndexSolid], csfgeom->indexWire, sizeof(GLuint) * csfgeom->numIndexWire);
    }

    geom.iboSize = sizeof(GLuint) * indices.size();

    glCreateBuffers(1, &geom.iboGL);
    glNamedBufferStorage(geom.iboGL, geom.iboSize, &indices[0], 0);

    if(has_GL_NV_vertex_buffer_unified_memory)
    {
      glGetNamedBufferParameterui64vNV(geom.vboGL, GL_BUFFER_GPU_ADDRESS_NV, &geom.vboADDR);
      glMakeNamedBufferResidentNV(geom.vboGL, GL_READ_ONLY);

      glGetNamedBufferParameterui64vNV(geom.iboGL, GL_BUFFER_GPU_ADDRESS_NV, &geom.iboADDR);
      glMakeNamedBufferResidentNV(geom.iboGL, GL_READ_ONLY);
    }

    geom.parts.resize(csfgeom->numParts);

    // per-part byte offsets into the index buffer; the wire section starts
    // right after the complete solid section
    size_t offsetSolid = 0;
    size_t offsetWire  = csfgeom->numIndexSolid * sizeof(GLuint);
    for(int i = 0; i < csfgeom->numParts; i++)
    {
      geom.parts[i].indexWire.count  = csfgeom->parts[i].numIndexWire;
      geom.parts[i].indexSolid.count = csfgeom->parts[i].numIndexSolid;

      geom.parts[i].indexWire.offset  = offsetWire;
      geom.parts[i].indexSolid.offset = offsetSolid;

      offsetWire += csfgeom->parts[i].numIndexWire * sizeof(GLuint);
      offsetSolid += csfgeom->parts[i].numIndexSolid * sizeof(GLuint);
    }
  }
  // clone geometry entries: copy c of geometry n lives at index n + numGeoms * c
  for(int c = 1; c <= clones; c++)
  {
    for(int n = 0; n < numGeoms; n++)
    {
      m_geometryBboxes[n + numGeoms * c] = m_geometryBboxes[n];

      const Geometry& geomorig = m_geometry[n];
      Geometry&       geom     = m_geometry[n + numGeoms * c];

      geom = geomorig;

      // active branch: the clone shares the original's GL buffers and only
      // records which geometry it was cloned from; the disabled branch would
      // give each clone its own buffer copies instead
#if 1
      geom.cloneIdx = n;
#else
      geom.cloneIdx = -1;
      glCreateBuffers(1, &geom.vboGL);
      glNamedBufferStorage(geom.vboGL, geom.vboSize, 0, 0);

      glCreateBuffers(1, &geom.iboGL);
      glNamedBufferStorage(geom.iboGL, geom.iboSize, 0, 0);

      if(has_GL_NV_vertex_buffer_unified_memory)
      {
        glGetNamedBufferParameterui64vNV(geom.vboGL, GL_BUFFER_GPU_ADDRESS_NV, &geom.vboADDR);
        glMakeNamedBufferResidentNV(geom.vboGL, GL_READ_ONLY);

        glGetNamedBufferParameterui64vNV(geom.iboGL, GL_BUFFER_GPU_ADDRESS_NV, &geom.iboADDR);
        glMakeNamedBufferResidentNV(geom.iboGL, GL_READ_ONLY);
      }

      glCopyNamedBufferSubData(geomorig.vboGL, geom.vboGL, 0, 0, geom.vboSize);
      glCopyNamedBufferSubData(geomorig.iboGL, geom.iboGL, 0, 0, geom.iboSize);
#endif
    }
  }


  // bounding boxes are uploaded and additionally exposed as an RGBA32F buffer texture
  glCreateBuffers(1, &m_geometryBboxesGL);
  glNamedBufferStorage(m_geometryBboxesGL, sizeof(BBox) * m_geometryBboxes.size(), &m_geometryBboxes[0], 0);
  glCreateTextures(GL_TEXTURE_BUFFER, 1, &m_geometryBboxesTexGL);
  glTextureBuffer(m_geometryBboxesTexGL, GL_RGBA32F, m_geometryBboxesGL);

  // nodes
  // Copies the matrices of every node and counts the nodes that reference a
  // geometry (these become objects below).
  int numObjects = 0;
  m_matrices.resize(csf->numNodes * copies);

  for(int n = 0; n < csf->numNodes; n++)
  {
    CSFNode* csfnode = &csf->nodes[n];

    memcpy(glm::value_ptr(m_matrices[n].objectMatrix), csfnode->objectTM, sizeof(float) * 16);
    memcpy(glm::value_ptr(m_matrices[n].worldMatrix), csfnode->worldTM, sizeof(float) * 16);

    m_matrices[n].objectMatrixIT = glm::transpose(glm::inverse(m_matrices[n].objectMatrix));
    m_matrices[n].worldMatrixIT  = glm::transpose(glm::inverse(m_matrices[n].worldMatrix));

    if(csfnode->geometryIDX < 0)
      continue;

    numObjects++;
  }


  // objects
  // numObjects is reused as the running write index and ends up holding the
  // object count of a single copy again.
  m_objects.resize(numObjects * copies);
  m_objectAssigns.resize(numObjects * copies);
  numObjects = 0;
  for(int n = 0; n < csf->numNodes; n++)
  {
    CSFNode* csfnode = &csf->nodes[n];

    if(csfnode->geometryIDX < 0)
      continue;

    Object& object = m_objects[numObjects];

    object.matrixIndex   = n;
    object.geometryIndex = csfnode->geometryIDX;

    m_objectAssigns[numObjects] = glm::ivec2(object.matrixIndex, object.geometryIndex);

    object.parts.resize(csfnode->numParts);
    for(int i = 0; i < csfnode->numParts; i++)
    {
      object.parts[i].active        = 1;
      // a negative part nodeIDX means the part uses the object's own matrix
      object.parts[i].matrixIndex   = csfnode->parts[i].nodeIDX < 0 ? object.matrixIndex : csfnode->parts[i].nodeIDX;
      object.parts[i].materialIndex = csfnode->parts[i].materialIDX;
    }

    // grow the scene bounds by this object's world-space bounding box
    BBox bbox = m_geometryBboxes[object.geometryIndex].transformed(m_matrices[n].worldMatrix);
    m_bbox.merge(bbox);

    updateObjectDrawCache(object);

    numObjects++;
  }

  // compute clone move delta based on m_bbox;

  glm::vec4 dim = m_bbox.max - m_bbox.min;

  // sq is the number of grid cells per used axis: the smallest value for which
  // sq^numAxis can hold all copies
  int sq      = 1;
  int numAxis = 0;
  for(int i = 0; i < 3; i++)
  {
    numAxis += (cloneaxis & (1 << i)) ? 1 : 0;
  }

  assert(numAxis);

  switch(numAxis)
  {
    case 1:
      sq = copies;
      break;
    case 2:
      while(sq * sq < copies)
      {
        sq++;
      }
      break;
    case 3:
      while(sq * sq * sq < copies)
      {
        sq++;
      }
      break;
  }


  for(int c = 1; c <= clones; c++)
  {
    int numNodes = csf->numNodes;

    // one grid step is the scene extent plus 5% spacing
    glm::vec4 shift = dim * 1.05f;

    // (u, v, w) are the grid coordinates of clone c along the used axes
    float u = 0;
    float v = 0;
    float w = 0;

    switch(numAxis)
    {
      case 1:
        u = float(c);
        break;
      case 2:
        u = float(c % sq);
        v = float(c / sq);
        break;
      case 3:
        u = float(c % sq);
        v = float((c / sq) % sq);
        w = float(c / (sq * sq));
        break;
    }

    // `use` walks through u, v, w and is handed to each enabled axis in
    // x, y, z order; disabled axes get no shift. x and z are shifted in the
    // negative direction, y in the positive one.
    float use = u;

    if(cloneaxis & (1 << 0))
    {
      shift.x *= -use;
      if(numAxis > 1)
        use = v;
    }
    else
    {
      shift.x = 0;
    }

    if(cloneaxis & (1 << 1))
    {
      shift.y *= use;
      if(numAxis > 2)
        use = w;
      else if(numAxis > 1)
        use = v;
    }
    else
    {
      shift.y = 0;
    }

    if(cloneaxis & (1 << 2))
    {
      shift.z *= -use;
    }
    else
    {
      shift.z = 0;
    }

    shift.w = 0;

    // move all world matrices
    for(int n = 0; n < numNodes; n++)
    {
      MatrixNode& node     = m_matrices[n + numNodes * c];
      MatrixNode& nodeOrig = m_matrices[n];
      node                 = nodeOrig;
      node.worldMatrix[3]  = node.worldMatrix[3] + shift;
      node.worldMatrixIT   = glm::transpose(glm::inverse(node.worldMatrix));
    }

    {
      // patch object matrix of root
      MatrixNode& node     = m_matrices[csf->rootIDX + numNodes * c];
      node.objectMatrix[3] = node.objectMatrix[3] + shift;
      node.objectMatrixIT  = glm::transpose(glm::inverse(node.objectMatrix));
    }

    // clone objects
    // All indices (geometry, matrix, per-part and cached draw-state matrix
    // indices) are rebased onto the clone's slice of the arrays.
    for(int n = 0; n < numObjects; n++)
    {
      const Object& objectorig = m_objects[n];
      Object&       object     = m_objects[n + numObjects * c];

      object = objectorig;
      object.geometryIndex += c * numGeoms;
      object.matrixIndex += c * numNodes;
      for(size_t i = 0; i < object.parts.size(); i++)
      {
        object.parts[i].matrixIndex += c * numNodes;
      }
      for(size_t i = 0; i < object.cacheSolid.state.size(); i++)
      {
        object.cacheSolid.state[i].matrixIndex += c * numNodes;
      }
      for(size_t i = 0; i < object.cacheWire.state.size(); i++)
      {
        object.cacheWire.state[i].matrixIndex += c * numNodes;
      }

      m_objectAssigns[n + numObjects * c] = glm::ivec2(object.matrixIndex, object.geometryIndex);
    }
  }

  glCreateBuffers(1, &m_matricesGL);
  glNamedBufferStorage(m_matricesGL, sizeof(MatrixNode) * m_matrices.size(), &m_matrices[0], 0);
  //glMapNamedBufferRange(m_matricesGL, 0, sizeof(MatrixNode) * m_matrices.size(), GL_MAP_PERSISTENT_BIT | GL_MAP_WRITE_BIT);

  glCreateTextures(GL_TEXTURE_BUFFER, 1, &m_matricesTexGL);
  glTextureBuffer(m_matricesTexGL, GL_RGBA32F, m_matricesGL);

  glCreateBuffers(1, &m_objectAssignsGL);
  glNamedBufferStorage(m_objectAssignsGL, sizeof(glm::ivec2) * m_objectAssigns.size(), &m_objectAssigns[0], 0);

  // query GPU addresses / handles; unload() undoes the residency in the same
  // nesting of extension checks
  if(has_GL_NV_vertex_buffer_unified_memory)
  {
    glGetNamedBufferParameterui64vNV(m_materialsGL, GL_BUFFER_GPU_ADDRESS_NV, &m_materialsADDR);
    glMakeNamedBufferResidentNV(m_materialsGL, GL_READ_ONLY);

    glGetNamedBufferParameterui64vNV(m_matricesGL, GL_BUFFER_GPU_ADDRESS_NV, &m_matricesADDR);
    glMakeNamedBufferResidentNV(m_matricesGL, GL_READ_ONLY);

    if(has_GL_ARB_bindless_texture)
    {
      m_matricesTexGLADDR = glGetTextureHandleARB(m_matricesTexGL);
      glMakeTextureHandleResidentARB(m_matricesTexGLADDR);
    }
  }

  // build the node tree: every copy contributes the same hierarchy, offset by
  // its slice of node ids, with its root attached to the tree root
  m_nodeTree.create(copies * csf->numNodes);
  for(int i = 0; i < copies; i++)
  {
    int cloneoffset = (csf->numNodes) * i;
    int root        = csf->rootIDX + cloneoffset;
    recursiveHierarchy(m_nodeTree, csf, csf->rootIDX, cloneoffset);

    m_nodeTree.setNodeParent((NodeTree::nodeID)root, m_nodeTree.getTreeRoot());
    m_nodeTree.addToTree((NodeTree::nodeID)root);
  }

  glCreateBuffers(1, &m_parentIDsGL);
  glNamedBufferStorage(m_parentIDsGL, m_nodeTree.getTreeCompactNodes().size() * sizeof(GLuint),
                       &m_nodeTree.getTreeCompactNodes()[0], 0);

  // second, untouched copy of the matrices; resetMatrices() copies it back
  // into m_matricesGL
  glCreateBuffers(1, &m_matricesOrigGL);
  glNamedBufferStorage(m_matricesOrigGL, sizeof(MatrixNode) * m_matrices.size(), &m_matrices[0], 0);
  glCreateTextures(GL_TEXTURE_BUFFER, 1, &m_matricesOrigTexGL);
  glTextureBuffer(m_matricesOrigTexGL, GL_RGBA32F, m_matricesOrigGL);

  CSFileMemory_delete(mem);
  return true;
}


// Temporary record used while building a DrawRangeCache: one index range of a
// geometry part together with the material/matrix state it is drawn with.
struct ListItem
{
  CadScene::DrawStateInfo state;  // material and matrix index of the part
  CadScene::DrawRange     range;  // byte offset and index count of the part
};

// Ordering predicate for std::sort: orders items by material index, then by
// matrix index, then by range offset (all ascending). Returns true when `a`
// must be placed before `b`.
//
// The fields are compared directly instead of subtracting them: a difference
// of two ints can overflow, and a size_t offset difference narrowed to int can
// flip its sign, either of which would break the strict weak ordering that
// std::sort requires.
static bool ListItem_compare(const ListItem& a, const ListItem& b)
{
  if(a.state.materialIndex != b.state.materialIndex)
    return a.state.materialIndex < b.state.materialIndex;

  if(a.state.matrixIndex != b.state.matrixIndex)
    return a.state.matrixIndex < b.state.matrixIndex;

  return a.range.offset < b.range.offset;
}

// Rebuilds `cache` from `list`, which is expected to be sorted so that items
// with equal state are adjacent.
//
// Consecutive items with equal state form one group. Within a group, a range
// that starts exactly where the pending range ends is merged into it (only if
// USE_CACHECOMBINE is non-zero); otherwise the pending range is emitted and a
// new one is started. For every group one entry is appended to cache.state
// and cache.stateCount (number of ranges emitted for that group); every
// emitted range appends one entry to cache.offsets and cache.counts.
static void fillCache(CadScene::DrawRangeCache& cache, const std::vector<ListItem>& list)
{
  cache = CadScene::DrawRangeCache();

  const size_t itemCount = list.size();
  if(itemCount == 0)
    return;

  size_t cursor = 0;
  while(cursor < itemCount)
  {
    // open a new state group with its first item as the pending range
    const CadScene::DrawStateInfo groupState    = list[cursor].state;
    CadScene::DrawRange           pending       = list[cursor].range;
    int                           rangesInGroup = 0;

    // consume all following items that share the group's state
    for(++cursor; cursor < itemCount && !(list[cursor].state != groupState); ++cursor)
    {
      const CadScene::DrawRange& next = list[cursor].range;

      const bool contiguous = USE_CACHECOMBINE && next.offset == (pending.offset + sizeof(GLuint) * pending.count);
      if(contiguous)
      {
        // extend the pending range
        pending.count += next.count;
      }
      else
      {
        // flush the pending range and start over with the current one
        rangesInGroup++;
        cache.offsets.push_back(pending.offset);
        cache.counts.push_back(pending.count);

        pending = next;
      }
    }

    // close the group: flush its last range and record the state
    rangesInGroup++;
    cache.offsets.push_back(pending.offset);
    cache.counts.push_back(pending.count);

    cache.state.push_back(groupState);
    cache.stateCount.push_back(rangesInGroup);
  }
}

// Rebuilds object.cacheSolid and object.cacheWire from the object's active
// parts and the index ranges of the geometry it references.
//
// Each active part contributes one solid and one wire item carrying the
// part's material and matrix index. The items are sorted with
// ListItem_compare and then folded into the caches by fillCache.
//
// Only parts that exist in both the geometry and the object are visited, so a
// mismatch between the two part counts cannot index past the end of either
// array.
void CadScene::updateObjectDrawCache(Object& object)
{
  const Geometry& geom = m_geometry[object.geometryIndex];

  const size_t numParts = std::min(geom.parts.size(), object.parts.size());

  std::vector<ListItem> listSolid;
  std::vector<ListItem> listWire;

  listSolid.reserve(numParts);
  listWire.reserve(numParts);

  for(size_t i = 0; i < numParts; i++)
  {
    const ObjectPart& part = object.parts[i];
    if(!part.active)
      continue;

    // solid and wire items share the same draw state, only the range differs
    ListItem item;
    item.state.materialIndex = part.materialIndex;
    item.state.matrixIndex   = part.matrixIndex;

    item.range = geom.parts[i].indexSolid;
    listSolid.push_back(item);

    item.range = geom.parts[i].indexWire;
    listWire.push_back(item);
  }

  std::sort(listSolid.begin(), listSolid.end(), ListItem_compare);
  std::sort(listWire.begin(), listWire.end(), ListItem_compare);

  fillCache(object.cacheSolid, listSolid);
  fillCache(object.cacheWire, listWire);
}

// Sets up the vertex input state for CadScene::Vertex data: both attributes
// read three floats, the position from offset 0 and the normal from the
// offset of Vertex::normal, and both are sourced from vertex buffer binding 0.
// Binding 0 is left with buffer 0 bound and the Vertex size as stride.
void CadScene::enableVertexFormat(int attrPos, int attrNormal)
{
  const unsigned int bindingSlot  = 0;
  const size_t       normalOffset = offsetof(CadScene::Vertex, normal);
  const size_t       vertexStride = sizeof(CadScene::Vertex);

  // attribute layouts
  glVertexAttribFormat(attrPos, 3, GL_FLOAT, GL_FALSE, 0);
  glVertexAttribFormat(attrNormal, 3, GL_FLOAT, GL_FALSE, normalOffset);

  // attribute -> buffer binding association
  glVertexAttribBinding(attrPos, bindingSlot);
  glVertexAttribBinding(attrNormal, bindingSlot);

  glEnableVertexAttribArray(attrPos);
  glEnableVertexAttribArray(attrNormal);

  glBindVertexBuffer(bindingSlot, 0, 0, vertexStride);
}

// Counterpart of enableVertexFormat: disables both vertex attribute arrays and
// binds buffer 0 to vertex buffer binding 0 (stride stays the Vertex size).
void CadScene::disableVertexFormat(int attrPos, int attrNormal)
{
  const int attribs[2] = {attrPos, attrNormal};
  for(int slot = 0; slot < 2; ++slot)
  {
    glDisableVertexAttribArray(attribs[slot]);
  }

  const size_t vertexStride = sizeof(CadScene::Vertex);
  glBindVertexBuffer(0, 0, 0, vertexStride);
}

void CadScene::unload()
{
  if(m_geometry.empty())
    return;

  glFinish();

  if(has_GL_NV_vertex_buffer_unified_memory)
  {
    if(has_GL_ARB_bindless_texture)
    {
      glMakeTextureHandleNonResidentARB(m_matricesTexGLADDR);
    }

    glMakeNamedBufferNonResidentNV(m_matricesGL);
    glMakeNamedBufferNonResidentNV(m_materialsGL);
  }

  glDeleteTextures(1, &m_matricesOrigTexGL);
  glDeleteTextures(1, &m_matricesTexGL);
  glDeleteTextures(1, &m_geometryBboxesTexGL);

  glDeleteBuffers(1, &m_matricesOrigGL);
  glDeleteBuffers(1, &m_matricesGL);
  glDeleteBuffers(1, &m_materialsGL);
  glDeleteBuffers(1, &m_objectAssignsGL);
  glDeleteBuffers(1, &m_geometryBboxesGL);
  glDeleteBuffers(1, &m_parentIDsGL);


  for(size_t i = 0; i < m_geometry.size(); i++)
  {
    if(m_geometry[i].cloneIdx >= 0)
      continue;

    if(has_GL_NV_vertex_buffer_unified_memory)
    {
      glMakeNamedBufferNonResidentNV(m_geometry[i].iboGL);
      glMakeNamedBufferNonResidentNV(m_geometry[i].vboGL);
    }
    glDeleteBuffers(1, &m_geometry[i].iboGL);
    glDeleteBuffers(1, &m_geometry[i].vboGL);
  }

  m_matrices.clear();
  m_geometryBboxes.clear();
  m_geometry.clear();
  m_objectAssigns.clear();
  m_objects.clear();
  m_geometryBboxes.clear();
  m_nodeTree.clear();

  glFinish();
}

// Restores the working matrix buffer by copying the complete contents of the
// original matrix buffer (m_matricesOrigGL) over m_matricesGL.
void CadScene::resetMatrices()
{
  const size_t byteCount = sizeof(CadScene::MatrixNode) * m_matrices.size();
  glCopyNamedBufferSubData(m_matricesOrigGL, m_matricesGL, 0, 0, byteCount);
}


================================================
FILE: cadscene.hpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#ifndef CADSCENE_H__
#define CADSCENE_H__

#include <cstring> // memset
#include <nvgl/extensions_gl.hpp>
#include <glm/glm.hpp>
#include <vector>
#include "nodetree.hpp"

// Container for a loaded CAD scene: the CPU-side arrays describing materials,
// geometries, matrices and objects, plus the OpenGL objects created from them.
// All members are public; loadCSF() fills them and unload() releases them.
class CadScene {

public:

  // Axis-aligned bounding box stored as two vec4 corners.
  // A default-constructed box is "empty" (min = +FLT_MAX, max = -FLT_MAX), so
  // the first merge() sets both corners.
  struct BBox {
    glm::vec4    min;
    glm::vec4    max;

    BBox() : min(FLT_MAX), max(-FLT_MAX) {}

    // Grows the box so that it contains `point` (component-wise, including w).
    inline void merge( const glm::vec4& point )
    {
      min = glm::min(min, point);
      max = glm::max(max, point);
    }

    // Grows the box so that it contains `bbox`.
    inline void merge( const BBox& bbox )
    {
      min = glm::min(min, bbox.min);
      max = glm::max(max, bbox.max);
    }

    // Returns the bounding box of this box's corners after multiplying them
    // with `matrix`. The first (1 << dim) corners are used: with the default
    // dim = 3 these are the 8 corners that carry min.w, with dim = 4 all 16
    // combinations including max.w.
    inline BBox transformed ( const glm::mat4 &matrix, int dim=3)
    {
      int i;
      glm::vec4 box[16];
      // create box corners
      box[0] = glm::vec4(min.x,min.y,min.z,min.w);
      box[1] = glm::vec4(max.x,min.y,min.z,min.w);
      box[2] = glm::vec4(min.x,max.y,min.z,min.w);
      box[3] = glm::vec4(max.x,max.y,min.z,min.w);
      box[4] = glm::vec4(min.x,min.y,max.z,min.w);
      box[5] = glm::vec4(max.x,min.y,max.z,min.w);
      box[6] = glm::vec4(min.x,max.y,max.z,min.w);
      box[7] = glm::vec4(max.x,max.y,max.z,min.w);

      box[8] = glm::vec4(min.x,min.y,min.z,max.w);
      box[9] = glm::vec4(max.x,min.y,min.z,max.w);
      box[10] = glm::vec4(min.x,max.y,min.z,max.w);
      box[11] = glm::vec4(max.x,max.y,min.z,max.w);
      box[12] = glm::vec4(min.x,min.y,max.z,max.w);
      box[13] = glm::vec4(max.x,min.y,max.z,max.w);
      box[14] = glm::vec4(min.x,max.y,max.z,max.w);
      box[15] = glm::vec4(max.x,max.y,max.z,max.w);

      // transform box corners
      // and find new mins,maxs
      BBox bbox;

      for (i = 0; i < (1<<dim) ; i++){
        glm::vec4 point = matrix * box[i];
        bbox.merge(point);
      }

      return bbox;
    }
  };

  // Color terms of one side of a material.
  struct MaterialSide {
    glm::vec4 ambient;
    glm::vec4 diffuse;
    glm::vec4 specular;
    glm::vec4 emissive;
  };

  // need to keep this 256 byte aligned (UBO range)
  // (2 sides * 64 bytes + 4 * 8 + 4 * 4 + 20 * 4 bytes = 256 bytes, given
  // 8-byte GLuint64 and 4-byte GLuint; _pad exists only to reach that size)
  struct Material {
    MaterialSide  sides[2];
    GLuint64      texturesADDR[4];
    GLuint        textures[4];
    GLuint        _pad[4+16];

    // zero-fills the whole struct, including the padding
    Material() {
      memset(this,0,sizeof(Material));
    }
  };

  // need to keep this 256 byte aligned (UBO range)
  // (four mat4 = 256 bytes; the *IT members hold the inverse-transpose of the
  // matching matrix, see loadCSF)
  struct MatrixNode {
    glm::mat4  worldMatrix;
    glm::mat4  worldMatrixIT;
    glm::mat4  objectMatrix;
    glm::mat4  objectMatrixIT;
  };

  // Vertex layout of the geometry vertex buffers.
  struct Vertex {
    glm::vec4 position;
    glm::vec4 normal;
  };

  // A run of indices inside a geometry's index buffer.
  struct DrawRange {
    size_t        offset;   // byte offset into the index buffer
    int           count;    // number of indices

    DrawRange() : offset(0) , count(0) {}
  };

  // State a draw range is rendered with; two infos are equal when both
  // indices are equal.
  struct DrawStateInfo {
    int           materialIndex;
    int           matrixIndex;

    friend bool operator != ( const DrawStateInfo &lhs,  const DrawStateInfo &rhs){
      return lhs.materialIndex != rhs.materialIndex || lhs.matrixIndex != rhs.matrixIndex;
    }

    friend bool operator == ( const DrawStateInfo &lhs,  const DrawStateInfo &rhs){
      return lhs.materialIndex == rhs.materialIndex && lhs.matrixIndex == rhs.matrixIndex;
    }
  };

  // Pre-sorted draw list of an object. `state` and `stateCount` have one entry
  // per state group; stateCount[i] is the number of consecutive entries of
  // `offsets`/`counts` that belong to state[i].
  struct DrawRangeCache {
    std::vector<DrawStateInfo>    state;
    std::vector<int>          stateCount;

    std::vector<size_t>       offsets;
    std::vector<int>          counts;
  };

  // Index ranges of one part of a geometry, for solid and wireframe drawing.
  struct GeometryPart {
    DrawRange     indexSolid;
    DrawRange     indexWire;
  };

  // One geometry: its GL vertex/index buffers, their sizes in bytes, optional
  // GPU addresses and the per-part index ranges.
  struct Geometry {
    GLuint    vboGL;
    GLuint    iboGL;
    GLuint64  vboADDR;
    GLuint64  iboADDR;
    size_t    vboSize;
    size_t    iboSize;

    std::vector<GeometryPart> parts;

    int       numVertices;
    int       numIndexSolid;
    int       numIndexWire;
    
    // -1 for an original geometry; otherwise the index of the geometry this
    // entry was cloned from (whose GL buffers it shares)
    int       cloneIdx;
  };

  // Per-object state of one geometry part.
  struct ObjectPart {
    int   active;          // parts with active == 0 are left out of the draw caches
    int   materialIndex;
    int   matrixIndex;
  };

  // An instance of a geometry: which matrix and geometry it uses, the state of
  // its parts and the draw caches derived from them.
  struct Object {
    int             matrixIndex;
    int             geometryIndex;

    std::vector<ObjectPart> parts;

    DrawRangeCache  cacheSolid;
    DrawRangeCache  cacheWire;
  };

  // CPU-side scene data
  std::vector<Material>       m_materials;
  std::vector<BBox>           m_geometryBboxes;
  std::vector<Geometry>       m_geometry;
  std::vector<MatrixNode>     m_matrices;
  std::vector<Object>         m_objects;
  std::vector<glm::ivec2>  m_objectAssigns;  // per object: (matrixIndex, geometryIndex)


  // bounds accumulated from the objects' transformed geometry boxes
  BBox      m_bbox;

  // GL objects created by loadCSF; the *ADDR members are only written when the
  // corresponding extensions are available
  GLuint    m_materialsGL;
  GLuint64  m_materialsADDR;
  GLuint    m_matricesGL;
  GLuint64  m_matricesADDR;
  GLuint    m_matricesTexGL;
  GLuint64  m_matricesTexGLADDR;
  GLuint    m_geometryBboxesGL;
  GLuint    m_geometryBboxesTexGL;
  GLuint    m_objectAssignsGL;

  GLuint    m_parentIDsGL;

  // unmodified copy of the matrices, source for resetMatrices()
  GLuint    m_matricesOrigGL;
  GLuint    m_matricesOrigTexGL;

  NodeTree  m_nodeTree;

  // Rebuilds the solid/wire draw caches of `object`.
  void  updateObjectDrawCache(Object& object);
  
  // Loads a scene (plus `clones` extra copies laid out along the axes selected
  // by the `cloneaxis` bit mask); returns false on failure.
  bool  loadCSF(const char* filename, int clones = 0, int cloneaxis=3);
  // Releases all resources of the loaded scene.
  void  unload();

  // Enable / disable the vertex attribute setup matching the Vertex struct.
  static void enableVertexFormat(int attrPos, int attrNormal);
  static void disableVertexFormat(int attrPos, int attrNormal);
  // Copies the original matrices back into the working matrix buffer.
  void resetMatrices();
};


#endif


================================================
FILE: common.h
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


// Slot numbers shared between the C++ code and the shaders that include this
// header.

// presumably vertex attribute locations - confirm against the shaders/renderers
#define VERTEX_POS      0
#define VERTEX_NORMAL   1
#define VERTEX_ASSIGNS  2
#define VERTEX_WIREMODE 3

// uniform buffer binding points (UBO_SCENE and UBO_MATRIX are used by the
// uniform block declarations further down)
#define UBO_SCENE     0
#define UBO_MATRIX    1
#define UBO_MATERIAL  2

// texture unit of the matrices buffer texture when it is bound by unit
#define TEX_MATRICES  0

#define USE_BASEINSTANCE  0

//#define UNI_WIREFRAME 0


#ifdef __cplusplus
namespace csfviewer
{
  using namespace glm;
#endif

// Per-scene constants. This declaration is compiled both as C++ (inside
// namespace csfviewer, where the glm types are pulled in) and as GLSL, where
// it is the member type of the std140 `sceneBuffer` uniform block, so any
// change to the members affects both sides.
struct SceneData {
  mat4  viewProjMatrix;
  mat4  viewMatrix;
  mat4  viewMatrixIT;

  vec4  viewPos;
  vec4  viewDir;
  
  vec4  wLightPos;
  
  ivec2 viewport;
  // converted to a samplerBuffer in the shaders when GL_NV_bindless_texture
  // is available
  uvec2 tboMatrices;
};

#ifdef __cplusplus
}
#endif


#if defined(GL_core_profile) || defined(GL_compatibility_profile) || defined(GL_es_profile)

// Shader-only declarations; the enclosing #if keeps them out of C++ builds.

// With NV_command_list enabled, uniform blocks default to commandBindableNV.
#extension GL_NV_command_list : enable
#if GL_NV_command_list
layout(commandBindableNV) uniform;
#endif

// prevent this to be used by c++

// Scene-wide constants, bound at UBO_SCENE.
layout(std140,binding=UBO_SCENE) uniform sceneBuffer {
  SceneData   scene;
};

// must match cadscene!
// (same four matrices, in the same order, as CadScene::MatrixNode)
layout(std140,binding=UBO_MATRIX) uniform matrixBuffer {
  mat4 worldMatrix;
  mat4 worldMatrixIT;
  mat4 objectMatrix;
  mat4 objectMatrixIT;
} object;

// matricesBuffer is either constructed from the handle stored in
// scene.tboMatrices (NV bindless textures) or a regular samplerBuffer bound
// at TEX_MATRICES.
#extension GL_ARB_bindless_texture : enable
#extension GL_NV_bindless_texture : enable
#if GL_NV_bindless_texture
#define matricesBuffer  samplerBuffer(scene.tboMatrices)
#else
layout(binding=TEX_MATRICES) uniform samplerBuffer matricesBuffer;
#endif
// must match cadscene!
// Index of each matrix within one node (`what` argument of getIndexedMatrix)
// and the number of matrices stored per node.
#define NODE_MATRIX_WORLD     0
#define NODE_MATRIX_WORLDIT   1
#define NODE_MATRIX_OBJECT    2
#define NODE_MATRIX_OBJECTIT  3
#define NODE_MATRICES         4

// Fetches one matrix of node `idx` from matricesBuffer. `what` selects the
// matrix within the node (one of the NODE_MATRIX_* values); every matrix
// occupies four consecutive texels, one per column passed to the mat4
// constructor.
mat4 getIndexedMatrix(int idx, int what)
{
  int firstTexel = (idx * NODE_MATRICES + what) * 4;

  vec4 column0 = texelFetch(matricesBuffer, firstTexel + 0);
  vec4 column1 = texelFetch(matricesBuffer, firstTexel + 1);
  vec4 column2 = texelFetch(matricesBuffer, firstTexel + 2);
  vec4 column3 = texelFetch(matricesBuffer, firstTexel + 3);

  return mat4(column0, column1, column2, column3);
}

#endif

================================================
FILE: csf.cpp
================================================
/*
 * Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#define CSF_IMPLEMENTATION
#define CSF_SUPPORT_GLTF2       1
#define CSF_SUPPORT_FILEMAPPING 1

#include <fileformats/cadscenefile.h>

#define CGLTF_IMPLEMENTATION
#include <cgltf.h>


================================================
FILE: csfviewer.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */

/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#define DEBUG_FILTER 1

#include <nvgl/extensions_gl.hpp>

#include <imgui/backends/imgui_impl_gl.h>
#include <imgui/imgui_helper.h>

#include <nvgl/glsltypes_gl.hpp>

#include <nvh/cameracontrol.hpp>
#include <nvh/fileoperations.hpp>
#include <nvh/geometry.hpp>
#include <nvh/misc.hpp>

#include <nvgl/appwindowprofiler_gl.hpp>
#include <nvgl/base_gl.hpp>
#include <nvgl/error_gl.hpp>
#include <nvgl/programmanager_gl.hpp>

#include "transformsystem.hpp"

#include "cadscene.hpp"
#include "renderer.hpp"

#include <algorithm>

#include "common.h"
#include "glm/gtc/matrix_access.hpp"
#include "glm/gtc/type_ptr.hpp"


namespace csfviewer {
// Default window size in pixels; main() passes both values to Sample::run().
int const SAMPLE_SIZE_WIDTH(800);
int const SAMPLE_SIZE_HEIGHT(600);
// NOTE(review): presumably the OpenGL context version the sample targets (4.5);
// not referenced anywhere in this file - confirm before relying on it.
int const SAMPLE_MAJOR_VERSION(4);
int const SAMPLE_MINOR_VERSION(5);


class Sample : public nvgl::AppWindowProfilerGL
{
public:
  enum GuiEnums
  {
    GUI_RENDERER,
    GUI_MSAA,
    GUI_SHADE,
    GUI_STRATEGY,
  };

  struct
  {
    nvgl::ProgramID draw_object, draw_object_tris, draw_object_line, draw_object_indexed, draw_object_indexed_tris,
        draw_object_indexed_line,

        cull_object_frustum, cull_object_hiz, cull_object_raster, cull_bit_temporallast, cull_bit_temporalnew,
        cull_bit_regular, cull_depth_mips,

        scan_prefixsum, scan_offsets, scan_combine,

        transform_leaves, transform_level,

        xplode;

  } programs;

  struct
  {
    GLuint scene  = 0;
    GLuint scene2 = 0;
  } fbos;

  struct
  {
    GLuint scene_ubo = 0;
  } buffers;

  struct
  {
    GLuint64 scene_ubo;
  } addresses;

  struct
  {
    GLuint scene_color         = 0;
    GLuint scene_color2        = 0;
    GLuint scene_depthstencil  = 0;
    GLuint scene_depthstencil2 = 0;
  } textures;

  struct Tweak
  {
    int       renderer      = 0;
    ShadeType shade         = SHADE_SOLID;
    Strategy  strategy      = STRATEGY_GROUPS;
    int       clones        = 0;
    bool      cloneaxisX    = true;
    bool      cloneaxisY    = true;
    bool      cloneaxisZ    = false;
    bool      animateActive = false;
    float     animateMin    = 1;
    float     animateDelta  = 1;
    int       zoom          = 100;
    int       msaa          = 0;
    bool      noUI          = false;
  };

  nvgl::ProgramManager m_progManager;

  ImGuiH::Registry m_ui;
  double           m_uiTime = 0;

  Tweak m_tweak;
  Tweak m_lastTweak;

  std::string m_modelFilename;

  SceneData       m_sceneUbo;
  CadScene        m_scene;
  TransformSystem m_transformSystem;

  GLuint m_xplodeGroupSize;

  std::vector<unsigned int> m_renderersSorted;
  std::string               m_rendererName;

  Renderer* NV_RESTRICT m_renderer;
  Resources             m_resources;

  size_t m_stateChangeID;


  void updateProgramDefine();
  bool initProgram();
  bool initScene(const char* filename, int clones, int cloneaxis);
  bool initFramebuffers(int width, int height);
  void initRenderer(int type, Strategy strategy);
  void deinitRenderer();

  void getCullPrograms(CullingSystem::Programs& cullprograms);
  void getScanPrograms(ScanSystem::Programs& scanprograms);
  void getTransformPrograms(TransformSystem::Programs& xfromPrograms);

  void updatedPrograms();

  void setupConfigParameters();
  void setRendererFromName();


public:
  Sample() { setupConfigParameters(); }

  bool validateConfig() override;

  bool begin() override;
  void think(double time) override;
  void resize(int width, int height) override;

  void processUI(double time);

  nvh::CameraControl m_control;

  void end() override { ImGui::ShutdownGL(); }
  // return true to prevent m_windowState updates
  bool mouse_pos(int x, int y) override
  {
    if(m_tweak.noUI)
      return false;
    return ImGuiH::mouse_pos(x, y);
  }
  bool mouse_button(int button, int action) override
  {
    if(m_tweak.noUI)
      return false;
    return ImGuiH::mouse_button(button, action);
  }
  bool mouse_wheel(int wheel) override
  {
    if(m_tweak.noUI)
      return false;
    return ImGuiH::mouse_wheel(wheel);
  }
  bool key_char(int button) override
  {
    if(m_tweak.noUI)
      return false;
    return ImGuiH::key_char(button);
  }
  bool key_button(int button, int action, int mods) override
  {
    if(m_tweak.noUI)
      return false;
    return ImGuiH::key_button(button, action, mods);
  }
};

// Intentionally empty hook; initProgram() calls it right before the programs are created.
void Sample::updateProgramDefine() {}

// Fills 'xformPrograms' with the current GL program objects of the two
// transform compute shaders, as resolved by the program manager.
void Sample::getTransformPrograms(TransformSystem::Programs& xformPrograms)
{
  auto resolve = [this](nvgl::ProgramID id) { return m_progManager.get(id); };

  xformPrograms.transform_leaves = resolve(programs.transform_leaves);
  xformPrograms.transform_level  = resolve(programs.transform_level);
}

// Fills 'cullprograms' with the current GL program objects of all culling
// shaders, as resolved by the program manager.
void Sample::getCullPrograms(CullingSystem::Programs& cullprograms)
{
  auto resolve = [this](nvgl::ProgramID id) { return m_progManager.get(id); };

  // object visibility tests
  cullprograms.object_frustum = resolve(programs.cull_object_frustum);
  cullprograms.object_hiz     = resolve(programs.cull_object_hiz);
  cullprograms.object_raster  = resolve(programs.cull_object_raster);

  // bit packing of the test results
  cullprograms.bit_regular      = resolve(programs.cull_bit_regular);
  cullprograms.bit_temporallast = resolve(programs.cull_bit_temporallast);
  cullprograms.bit_temporalnew  = resolve(programs.cull_bit_temporalnew);

  // depth mip-map generation
  cullprograms.depth_mips = resolve(programs.cull_depth_mips);
}

// Fills 'scanprograms' with the current GL program objects of the three
// scan compute shader variants, as resolved by the program manager.
void Sample::getScanPrograms(ScanSystem::Programs& scanprograms)
{
  auto resolve = [this](nvgl::ProgramID id) { return m_progManager.get(id); };

  scanprograms.prefixsum = resolve(programs.scan_prefixsum);
  scanprograms.offsets   = resolve(programs.scan_offsets);
  scanprograms.combine   = resolve(programs.scan_combine);
}

bool Sample::initProgram()
{
  bool validated(true);
  m_progManager.m_filetype = nvh::ShaderFileManager::FILETYPE_GLSL;
  m_progManager.addDirectory(std::string("GLSL_" PROJECT_NAME));
  m_progManager.addDirectory(exePath() + std::string(PROJECT_RELDIRECTORY));

  m_progManager.registerInclude("common.h");

  updateProgramDefine();

  programs.draw_object =
      m_progManager.createProgram(nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "scene.vert.glsl"),
                                  nvgl::ProgramManager::Definition(GL_FRAGMENT_SHADER, "scene.frag.glsl"));

  programs.draw_object_tris = m_progManager.createProgram(
      nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "#define WIREMODE 0\n", "scene.vert.glsl"),
      nvgl::ProgramManager::Definition(GL_FRAGMENT_SHADER, "#define WIREMODE 0\n", "scene.frag.glsl"));

  programs.draw_object_line = m_progManager.createProgram(
      nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "#define WIREMODE 1\n", "scene.vert.glsl"),
      nvgl::ProgramManager::Definition(GL_FRAGMENT_SHADER, "#define WIREMODE 1\n", "scene.frag.glsl"));

  programs.draw_object_indexed = m_progManager.createProgram(
      nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "#define USE_INDEXING 1\n", "scene.vert.glsl"),
      nvgl::ProgramManager::Definition(GL_FRAGMENT_SHADER, "#define USE_INDEXING 1\n", "scene.frag.glsl"));

  programs.draw_object_indexed_tris = m_progManager.createProgram(
      nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "#define USE_INDEXING 1\n#define WIREMODE 0\n",
                                       "scene.vert.glsl"),
      nvgl::ProgramManager::Definition(GL_FRAGMENT_SHADER, "#define USE_INDEXING 1\n#define WIREMODE 0\n",
                                       "scene.frag.glsl"));

  programs.draw_object_indexed_line = m_progManager.createProgram(
      nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "#define USE_INDEXING 1\n#define WIREMODE 1\n",
                                       "scene.vert.glsl"),
      nvgl::ProgramManager::Definition(GL_FRAGMENT_SHADER, "#define USE_INDEXING 1\n#define WIREMODE 1\n",
                                       "scene.frag.glsl"));


  programs.cull_object_raster = m_progManager.createProgram(
      nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "#define DUALINDEX 1\n#define MATRICES 4\n",
                                       "cull-raster.vert.glsl"),
      nvgl::ProgramManager::Definition(GL_GEOMETRY_SHADER, "#define DUALINDEX 1\n#define MATRICES 4\n",
                                       "cull-raster.geo.glsl"),
      nvgl::ProgramManager::Definition(GL_FRAGMENT_SHADER, "#define DUALINDEX 1\n#define MATRICES 4\n",
                                       "cull-raster.frag.glsl"));

  programs.cull_object_frustum = m_progManager.createProgram(nvgl::ProgramManager::Definition(
      GL_VERTEX_SHADER, "#define DUALINDEX 1\n#define MATRICES 4\n", "cull-xfb.vert.glsl"));

  programs.cull_object_hiz = m_progManager.createProgram(nvgl::ProgramManager::Definition(
      GL_VERTEX_SHADER, "#define DUALINDEX 1\n#define MATRICES 4\n#define OCCLUSION\n", "cull-xfb.vert.glsl"));

  programs.cull_bit_regular = m_progManager.createProgram(
      nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "#define TEMPORAL 0\n", "cull-bitpack.vert.glsl"));
  programs.cull_bit_temporallast = m_progManager.createProgram(
      nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "#define TEMPORAL TEMPORAL_LAST\n", "cull-bitpack.vert.glsl"));
  programs.cull_bit_temporalnew = m_progManager.createProgram(
      nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "#define TEMPORAL TEMPORAL_NEW\n", "cull-bitpack.vert.glsl"));

  programs.cull_depth_mips =
      m_progManager.createProgram(nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "cull-downsample.vert.glsl"),
                                  nvgl::ProgramManager::Definition(GL_FRAGMENT_SHADER, "cull-downsample.frag.glsl"));

  programs.scan_prefixsum = m_progManager.createProgram(
      nvgl::ProgramManager::Definition(GL_COMPUTE_SHADER, "#define TASK TASK_SUM\n", "scan.comp.glsl"));
  programs.scan_offsets = m_progManager.createProgram(
      nvgl::ProgramManager::Definition(GL_COMPUTE_SHADER, "#define TASK TASK_OFFSETS\n", "scan.comp.glsl"));
  programs.scan_combine = m_progManager.createProgram(
      nvgl::ProgramManager::Definition(GL_COMPUTE_SHADER, "#define TASK TASK_COMBINE\n", "scan.comp.glsl"));

  programs.transform_leaves =
      m_progManager.createProgram(nvgl::ProgramManager::Definition(GL_COMPUTE_SHADER, "transform-leaves.comp.glsl"));
  programs.transform_level =
      m_progManager.createProgram(nvgl::ProgramManager::Definition(GL_COMPUTE_SHADER, "transform-level.comp.glsl"));

  programs.xplode =
      m_progManager.createProgram(nvgl::ProgramManager::Definition(GL_COMPUTE_SHADER, "xplode-animation.comp.glsl"));

  validated = m_progManager.areProgramsValid();

  return validated;
}

// (Re)loads the scene and (re)creates the scene uniform buffer.
//   filename  : model file handed to CadScene::loadCSF
//   clones    : number of clones requested from the loader
//   cloneaxis : bit mask of clone axes (bit 0 = X, bit 1 = Y, bit 2 = Z, as
//               assembled by think())
// Returns the status reported by the scene loader. Scene statistics are
// logged regardless of the status.
bool Sample::initScene(const char* filename, int clones, int cloneaxis)
{
  m_scene.unload();

  // the previous buffer must not stay resident once it gets replaced
  if(buffers.scene_ubo && has_GL_NV_shader_buffer_load)
  {
    glMakeNamedBufferNonResidentNV(buffers.scene_ubo);
  }

  nvgl::newBuffer(buffers.scene_ubo);
  glNamedBufferStorage(buffers.scene_ubo, sizeof(SceneData), NULL, GL_DYNAMIC_STORAGE_BIT);

  // Without NV_shader_buffer_load no GPU address is queried; publish a
  // well-defined 0 instead of an indeterminate or stale value.
  addresses.scene_ubo = 0;
  if(has_GL_NV_shader_buffer_load)
  {
    glGetNamedBufferParameterui64vNV(buffers.scene_ubo, GL_BUFFER_GPU_ADDRESS_NV, &addresses.scene_ubo);
    glMakeNamedBufferResidentNV(buffers.scene_ubo, GL_READ_ONLY);
  }

  m_resources.sceneUbo  = buffers.scene_ubo;
  m_resources.sceneAddr = addresses.scene_ubo;

  // signal that the buffer handle/address changed
  m_resources.stateChangeID++;

  bool status = m_scene.loadCSF(filename, clones, cloneaxis);

  LOGI("\nscene %s\n", filename);
  LOGI("geometries: %6d\n", (uint32_t)m_scene.m_geometry.size());
  LOGI("materials:  %6d\n", (uint32_t)m_scene.m_materials.size());
  LOGI("nodes:      %6d\n", (uint32_t)m_scene.m_matrices.size());
  LOGI("objects:    %6d\n", (uint32_t)m_scene.m_objects.size());
  LOGI("\n");

  return status;
}

bool Sample::initFramebuffers(int width, int height)
{
  bool layered = true;

  if(!fbos.scene || m_tweak.msaa != m_lastTweak.msaa)
  {
    nvgl::newFramebuffer(fbos.scene);
    nvgl::newFramebuffer(fbos.scene2);

    m_resources.fbo  = fbos.scene;
    m_resources.fbo2 = fbos.scene2;

    m_resources.stateChangeID++;
  }

  if(layered)
  {

    if(has_GL_NV_bindless_texture && textures.scene_color)
    {
      glMakeTextureHandleNonResidentNV(glGetTextureHandleNV(textures.scene_color));
      glMakeTextureHandleNonResidentNV(glGetTextureHandleNV(textures.scene_depthstencil));
    }

    nvgl::newTexture(textures.scene_color, m_tweak.msaa ? GL_TEXTURE_2D_MULTISAMPLE_ARRAY : GL_TEXTURE_2D_ARRAY);
    nvgl::newTexture(textures.scene_depthstencil, m_tweak.msaa ? GL_TEXTURE_2D_MULTISAMPLE_ARRAY : GL_TEXTURE_2D_ARRAY);

    if(m_tweak.msaa)
    {
      glTextureStorage3DMultisample(textures.scene_color, m_tweak.msaa, GL_RGBA8, width, height, 2, GL_TRUE);
      glTextureStorage3DMultisample(textures.scene_depthstencil, m_tweak.msaa, GL_DEPTH24_STENCIL8, width, height, 2, GL_TRUE);
    }
    else
    {
      glTextureStorage3D(textures.scene_color, 1, GL_RGBA8, width, height, 2);
      glTextureStorage3D(textures.scene_depthstencil, 1, GL_DEPTH24_STENCIL8, width, height, 2);
    }

    glNamedFramebufferTextureLayer(fbos.scene, GL_COLOR_ATTACHMENT0, textures.scene_color, 0, 0);
    glNamedFramebufferTextureLayer(fbos.scene, GL_DEPTH_STENCIL_ATTACHMENT, textures.scene_depthstencil, 0, 0);

    glNamedFramebufferTextureLayer(fbos.scene2, GL_COLOR_ATTACHMENT0, textures.scene_color, 0, 1);
    glNamedFramebufferTextureLayer(fbos.scene2, GL_DEPTH_STENCIL_ATTACHMENT, textures.scene_depthstencil, 0, 1);

    if(has_GL_NV_bindless_texture)
    {
      glMakeTextureHandleResidentNV(glGetTextureHandleNV(textures.scene_color));
      glMakeTextureHandleResidentNV(glGetTextureHandleNV(textures.scene_depthstencil));
    }
  }
  else
  {

    if(has_GL_NV_bindless_texture && textures.scene_color)
    {
      glMakeTextureHandleNonResidentNV(glGetTextureHandleNV(textures.scene_color));
      glMakeTextureHandleNonResidentNV(glGetTextureHandleNV(textures.scene_depthstencil));
      glMakeTextureHandleNonResidentNV(glGetTextureHandleNV(textures.scene_color2));
      glMakeTextureHandleNonResidentNV(glGetTextureHandleNV(textures.scene_depthstencil2));
    }

    nvgl::newTexture(textures.scene_color, m_tweak.msaa ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D);
    nvgl::newTexture(textures.scene_depthstencil, m_tweak.msaa ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D);

    if(m_tweak.msaa)
    {
      glTextureStorage2DMultisample(textures.scene_color, 1, GL_RGBA8, width, height, GL_TRUE);
      glTextureStorage2DMultisample(textures.scene_depthstencil, 1, GL_DEPTH24_STENCIL8, width, height, GL_TRUE);
    }
    else
    {
      glTextureStorage2D(textures.scene_color, 1, GL_RGBA8, width, height);
      glTextureStorage2D(textures.scene_depthstencil, 1, GL_DEPTH24_STENCIL8, width, height);
    }

    glNamedFramebufferTexture(fbos.scene, GL_COLOR_ATTACHMENT0, textures.scene_color, 0);
    glNamedFramebufferTexture(fbos.scene, GL_DEPTH_STENCIL_ATTACHMENT, textures.scene_depthstencil, 0);

    nvgl::newTexture(textures.scene_color2, m_tweak.msaa ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D);
    nvgl::newTexture(textures.scene_depthstencil2, m_tweak.msaa ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D);

    if(m_tweak.msaa)
    {
      glTextureStorage2DMultisample(textures.scene_color2, 1, GL_RGBA8, width, height, GL_TRUE);
      glTextureStorage2DMultisample(textures.scene_depthstencil2, 1, GL_DEPTH24_STENCIL8, width, height, GL_TRUE);
    }
    else
    {
      glTextureStorage2D(textures.scene_color2, 1, GL_RGBA8, width, height);
      glTextureStorage2D(textures.scene_depthstencil2, 1, GL_DEPTH24_STENCIL8, width, height);
    }

    glNamedFramebufferTexture(fbos.scene2, GL_COLOR_ATTACHMENT0, textures.scene_color2, 0);
    glNamedFramebufferTexture(fbos.scene2, GL_DEPTH_STENCIL_ATTACHMENT, textures.scene_depthstencil2, 0);

    if(has_GL_NV_bindless_texture)
    {
      glMakeTextureHandleResidentNV(glGetTextureHandleNV(textures.scene_color));
      glMakeTextureHandleResidentNV(glGetTextureHandleNV(textures.scene_depthstencil));
      glMakeTextureHandleResidentNV(glGetTextureHandleNV(textures.scene_color2));
      glMakeTextureHandleResidentNV(glGetTextureHandleNV(textures.scene_depthstencil2));
    }
  }

  m_resources.fboTextureChangeID++;

  return true;
}

// Shuts down and destroys the active renderer, if there is one.
void Sample::deinitRenderer()
{
  if(!m_renderer)
  {
    return;
  }

  m_renderer->deinit();
  delete m_renderer;
  m_renderer = nullptr;
}

// Replaces the active renderer.
//   type     : position in m_renderersSorted (not a raw registry index)
//   strategy : draw strategy assigned to the new renderer before init()
void Sample::initRenderer(int type, Strategy strategy)
{
  deinitRenderer();

  // translate the sorted-list position into the registry slot
  const unsigned int registrySlot = m_renderersSorted[type];

  Renderer::getRegistry()[registrySlot]->updatedPrograms(m_progManager);

  m_renderer             = Renderer::getRegistry()[registrySlot]->create();
  m_renderer->m_strategy = strategy;
  m_renderer->init(&m_scene, m_resources);
}

bool Sample::begin()
{
  m_renderer      = NULL;
  m_stateChangeID = 0;

  ImGuiH::Init(m_windowState.m_winSize[0], m_windowState.m_winSize[1], this);
  ImGui::InitGL();

  glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
  glEnable(GL_CULL_FACE);
  glEnable(GL_DEPTH_TEST);

#if defined(NDEBUG)
  setVsync(false);
#endif

  Renderer::s_bindless_ubo = !!m_contextWindow.extensionSupported("GL_NV_uniform_buffer_unified_memory");
  LOGI("\nNV_uniform_buffer_unified_memory support: %s\n\n", Renderer::s_bindless_ubo ? "true" : "false");

  bool validated(true);

  GLuint defaultVAO;
  glGenVertexArrays(1, &defaultVAO);
  glBindVertexArray(defaultVAO);

  validated = validated && initProgram();
  validated = validated && initScene(m_modelFilename.c_str(), 0, 3);
  validated = validated && initFramebuffers(m_windowState.m_winSize[0], m_windowState.m_winSize[1]);


  const Renderer::Registry registry = Renderer::getRegistry();
  for(size_t i = 0; i < registry.size(); i++)
  {
    if(registry[i]->isAvailable())
    {
      if(!registry[i]->loadPrograms(m_progManager))
      {
        LOGE("Failed to load resources for renderer %s\n", registry[i]->name());
        return false;
      }

      uint sortkey = uint(i);
      sortkey |= registry[i]->priority() << 16;
      m_renderersSorted.push_back(sortkey);
    }
  }

  std::sort(m_renderersSorted.begin(), m_renderersSorted.end());

  for(size_t i = 0; i < m_renderersSorted.size(); i++)
  {
    m_renderersSorted[i] &= 0xFFFF;

    m_ui.enumAdd(GUI_RENDERER, int(i), registry[m_renderersSorted[i]]->name());
  }

  {
    m_ui.enumAdd(GUI_STRATEGY, STRATEGY_INDIVIDUAL, "drawcall individual");
    m_ui.enumAdd(GUI_STRATEGY, STRATEGY_JOIN, "drawcall join");
    m_ui.enumAdd(GUI_STRATEGY, STRATEGY_GROUPS, "material groups");

    m_ui.enumAdd(GUI_SHADE, SHADE_SOLID, toString(SHADE_SOLID));
    m_ui.enumAdd(GUI_SHADE, SHADE_SOLIDWIRE, toString(SHADE_SOLIDWIRE));
    m_ui.enumAdd(GUI_SHADE, SHADE_SOLIDWIRE_SPLIT, "solid w edges (split test, only in sorted)");

    m_ui.enumAdd(GUI_MSAA, 0, "none");
    m_ui.enumAdd(GUI_MSAA, 2, "2x");
    m_ui.enumAdd(GUI_MSAA, 4, "4x");
    m_ui.enumAdd(GUI_MSAA, 8, "8x");
  }


  m_control.m_sceneOrbit     = glm::vec3(m_scene.m_bbox.max + m_scene.m_bbox.min) * 0.5f;
  m_control.m_sceneDimension = glm::length((m_scene.m_bbox.max - m_scene.m_bbox.min));
  m_control.m_viewMatrix =
      glm::lookAt(m_control.m_sceneOrbit - (-vec3(1, 1, 1) * m_control.m_sceneDimension * 0.5f * (float(m_tweak.zoom) / 100.0f)),
                      m_control.m_sceneOrbit, vec3(0, 1, 0));

  m_sceneUbo.wLightPos   = (m_scene.m_bbox.max + m_scene.m_bbox.min) * 0.5f + m_control.m_sceneDimension;
  m_sceneUbo.wLightPos.w = 1.0;

  updatedPrograms();

  CullingSystem::Programs cullprogs;
  getCullPrograms(cullprogs);
  Renderer::s_cullsys.init(cullprogs, true);

  ScanSystem::Programs scanprogs;
  getScanPrograms(scanprogs);
  Renderer::s_scansys.init(scanprogs);
  //Renderer::s_scansys.test();

  TransformSystem::Programs xformprogs;
  getTransformPrograms(xformprogs);
  m_transformSystem.init(xformprogs);


  initRenderer(m_tweak.renderer, m_tweak.strategy);

  return validated;
}

void Sample::processUI(double time)
{
  int width  = m_windowState.m_winSize[0];
  int height = m_windowState.m_winSize[1];

  // Update imgui configuration
  auto& imgui_io       = ImGui::GetIO();
  imgui_io.DeltaTime   = static_cast<float>(time - m_uiTime);
  imgui_io.DisplaySize = ImVec2(static_cast<float>(width), static_cast<float>(height));

  m_uiTime = time;

  ImGui::NewFrame();
  ImGui::SetNextWindowSize(ImGuiH::dpiScaled(350, 0), ImGuiCond_FirstUseEver);
  if(ImGui::Begin("NVIDIA " PROJECT_NAME, nullptr))
  {
    m_ui.enumCombobox(GUI_RENDERER, "renderer", &m_tweak.renderer);
    m_ui.enumCombobox(GUI_STRATEGY, "strategy", &m_tweak.strategy);
    m_ui.enumCombobox(GUI_SHADE, "shademode", &m_tweak.shade);
    ImGui::Checkbox("xplode via GPU", &m_tweak.animateActive);
    ImGui::SliderFloat("xplode min", &m_tweak.animateMin, 0, 16.0f);
    ImGui::SliderFloat("xplode delta", &m_tweak.animateDelta, 0, 16.0f);
    ImGuiH::InputIntClamped("clones", &m_tweak.clones, 0, 255, 1, 10, ImGuiInputTextFlags_EnterReturnsTrue);
    ImGui::Checkbox("clone X", &m_tweak.cloneaxisX);
    ImGui::Checkbox("clone Y", &m_tweak.cloneaxisY);
    ImGui::Checkbox("clone Z", &m_tweak.cloneaxisZ);
    m_ui.enumCombobox(GUI_MSAA, "msaa", &m_tweak.msaa);
  }
  if(!m_tweak.cloneaxisX && !m_tweak.cloneaxisY && !m_tweak.cloneaxisZ)
  {
    m_tweak.cloneaxisX = true;
  }

  ImGui::End();
}

void Sample::updatedPrograms()
{

  CullingSystem::Programs cullprogs;
  getCullPrograms(cullprogs);
  Renderer::s_cullsys.update(cullprogs, true);

  ScanSystem::Programs scanprogs;
  getScanPrograms(scanprogs);
  Renderer::s_scansys.update(scanprogs);

  TransformSystem::Programs xformprogs;
  getTransformPrograms(xformprogs);
  m_transformSystem.update(xformprogs);

  m_resources.programUbo     = m_progManager.get(programs.draw_object);
  m_resources.programUboLine = m_progManager.get(programs.draw_object_line);
  m_resources.programUboTris = m_progManager.get(programs.draw_object_tris);
  m_resources.programIdx     = m_progManager.get(programs.draw_object_indexed);
  m_resources.programIdxLine = m_progManager.get(programs.draw_object_indexed_line);
  m_resources.programIdxTris = m_progManager.get(programs.draw_object_indexed_tris);

  GLuint groupsizes[3];
  glGetProgramiv(m_progManager.get(programs.xplode), GL_COMPUTE_WORK_GROUP_SIZE, (GLint*)groupsizes);
  m_xplodeGroupSize = groupsizes[0];

  m_resources.stateChangeID++;
}

void Sample::think(double time)
{
  NV_PROFILE_GL_SECTION("Frame");

  processUI(time);

  m_control.processActions({m_windowState.m_winSize[0], m_windowState.m_winSize[1]},
                           glm::vec2(m_windowState.m_mouseCurrent[0], m_windowState.m_mouseCurrent[1]),
                           m_windowState.m_mouseButtonFlags, m_windowState.m_mouseWheel);

  if(m_windowState.onPress(KEY_R))
  {
    m_progManager.reloadPrograms();
    Renderer::getRegistry()[m_tweak.renderer]->updatedPrograms(m_progManager);
    updatedPrograms();
  }

  if(m_tweak.msaa != m_lastTweak.msaa)
  {
    initFramebuffers(m_windowState.m_winSize[0], m_windowState.m_winSize[1]);
  }

  if(m_tweak.clones != m_lastTweak.clones || m_tweak.cloneaxisX != m_lastTweak.cloneaxisX
     || m_tweak.cloneaxisY != m_lastTweak.cloneaxisY || m_tweak.cloneaxisZ != m_lastTweak.cloneaxisZ)
  {
    deinitRenderer();
    initScene(m_modelFilename.c_str(), m_tweak.clones,
              (int(m_tweak.cloneaxisX) << 0) | (int(m_tweak.cloneaxisY) << 1) | (int(m_tweak.cloneaxisZ) << 2));
  }

  if(m_tweak.renderer != m_lastTweak.renderer || m_tweak.strategy != m_lastTweak.strategy
     || m_tweak.cloneaxisX != m_lastTweak.cloneaxisX || m_tweak.cloneaxisY != m_lastTweak.cloneaxisY
     || m_tweak.cloneaxisZ != m_lastTweak.cloneaxisZ || m_tweak.clones != m_lastTweak.clones)
  {
    initRenderer(m_tweak.renderer, m_tweak.strategy);
  }

  if(!m_tweak.animateActive && m_lastTweak.animateActive)
  {
    m_scene.resetMatrices();
  }

  m_lastTweak = m_tweak;

  int width  = m_windowState.m_winSize[0];
  int height = m_windowState.m_winSize[1];

  {
    // generic state setup
    glViewport(0, 0, width, height);

    if(m_tweak.shade == SHADE_SOLIDWIRE_SPLIT)
    {
      glBindFramebuffer(GL_FRAMEBUFFER, fbos.scene2);
      glClearColor(0.2f, 0.2f, 0.2f, 0.0f);
      glClearDepth(1.0);
      glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);
    }

    glBindFramebuffer(GL_FRAMEBUFFER, fbos.scene);
    glClearColor(0.2f, 0.2f, 0.2f, 0.0f);
    glClearDepth(1.0);
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);

    glEnable(GL_DEPTH_TEST);

    m_sceneUbo.viewport = ivec2(width, height);

    glm::mat4 projection = glm::perspectiveRH_ZO((45.f), float(width) / float(height),
                                                  m_control.m_sceneDimension * 0.001f, m_control.m_sceneDimension * 10.0f);
    glm::mat4 view       = m_control.m_viewMatrix;

    m_sceneUbo.viewProjMatrix = projection * view;
    m_sceneUbo.viewMatrix     = view;
    m_sceneUbo.viewMatrixIT   = glm::transpose(glm::inverse(view));

    m_sceneUbo.viewPos = glm::row(m_sceneUbo.viewMatrixIT, 3);
    m_sceneUbo.viewDir = -glm::row(view,2);

    m_sceneUbo.wLightPos   = glm::row(m_sceneUbo.viewMatrixIT, 3);
    m_sceneUbo.wLightPos.w = 1.0;

    m_sceneUbo.tboMatrices = uvec2(m_scene.m_matricesTexGLADDR & 0xFFFFFFFF, m_scene.m_matricesTexGLADDR >> 32);

    glNamedBufferSubData(buffers.scene_ubo, 0, sizeof(SceneData), &m_sceneUbo);

    glDisable(GL_CULL_FACE);
  }

  if(m_tweak.animateActive)
  {
    {
      NV_PROFILE_GL_SECTION("Xplode");

      float  speed      = 0.5;
      float  scale      = m_tweak.animateMin + (cosf(float(time) * speed) * 0.5f + 0.5f) * (m_tweak.animateDelta);
      GLuint totalNodes = GLuint(m_scene.m_matrices.size());
      GLuint groupsize  = m_xplodeGroupSize;

      glUseProgram(m_progManager.get(programs.xplode));
      glUniform1f(0, scale);
      glUniform1i(1, totalNodes);

      nvgl::bindMultiTexture(GL_TEXTURE0, GL_TEXTURE_BUFFER, m_scene.m_matricesOrigTexGL);
      glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, m_scene.m_matricesGL);

      glDispatchCompute((totalNodes + groupsize - 1) / groupsize, 1, 1);
      glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

      nvgl::bindMultiTexture(GL_TEXTURE0, GL_TEXTURE_BUFFER, 0);
      glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, 0);
      glUseProgram(0);
    }

    {
      NV_PROFILE_GL_SECTION("Tree");
      TransformSystem::Buffer ids;
      TransformSystem::Buffer world;
      TransformSystem::Buffer object;

      ids.buffer = m_scene.m_parentIDsGL;
      ids.offset = 0;
      ids.size   = sizeof(GLuint) * m_scene.m_matrices.size();

      world.buffer = m_scene.m_matricesGL;
      world.offset = 0;
      world.size   = sizeof(CadScene::MatrixNode) * m_scene.m_matrices.size();

      object.buffer = m_scene.m_matricesGL;
      object.offset = 0;
      object.size   = sizeof(CadScene::MatrixNode) * m_scene.m_matrices.size();

      m_transformSystem.process(m_scene.m_nodeTree, ids, object, world);
    }
  }

  {
    NV_PROFILE_GL_SECTION("Render");

    m_resources.cullView.viewPos        = glm::value_ptr(m_sceneUbo.viewPos);
    m_resources.cullView.viewDir        = glm::value_ptr(m_sceneUbo.viewDir);
    m_resources.cullView.viewProjMatrix = glm::value_ptr(m_sceneUbo.viewProjMatrix);

    m_renderer->draw(m_tweak.shade, m_resources, m_profiler, m_progManager);
  }


  {
    NV_PROFILE_GL_SECTION("Blit");


    if(m_tweak.shade == SHADE_SOLIDWIRE_SPLIT)
    {
      glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);

      int wh = width / 2;
      int hh = height / 2;

      glBindFramebuffer(GL_READ_FRAMEBUFFER, fbos.scene);
      glBlitFramebuffer(0, 0, wh, hh, 0, 0, wh, hh, GL_COLOR_BUFFER_BIT, GL_NEAREST);
      glBlitFramebuffer(wh, hh, width, height, wh, hh, width, height, GL_COLOR_BUFFER_BIT, GL_NEAREST);

      glBindFramebuffer(GL_READ_FRAMEBUFFER, fbos.scene2);
      glBlitFramebuffer(wh, 0, width, hh, wh, 0, width, hh, GL_COLOR_BUFFER_BIT, GL_NEAREST);
      glBlitFramebuffer(0, hh, wh, height, 0, hh, wh, height, GL_COLOR_BUFFER_BIT, GL_NEAREST);
    }
    else
    {
      // blit to background
      glBindFramebuffer(GL_READ_FRAMEBUFFER, fbos.scene);
      glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
      glBlitFramebuffer(0, 0, width, height, 0, 0, width, height, GL_COLOR_BUFFER_BIT, GL_NEAREST);
    }
  }

  if(!m_tweak.noUI)
  {
    NV_PROFILE_GL_SECTION("GUI");
    ImGui::Render();
    ImGui::RenderDrawDataGL(ImGui::GetDrawData());
  }

  ImGui::EndFrame();

  m_lastTweak = m_tweak;
}

// Window resize callback: recreates the offscreen render targets at the new size.
void Sample::resize(int width, int height)
{
  initFramebuffers(width, height);
}

// Selects the renderer whose name equals m_rendererName, if such a renderer
// is among the available ones. Does nothing when no name was given. If
// several entries match, the last one wins.
void Sample::setRendererFromName()
{
  if(m_rendererName.empty())
  {
    return;
  }

  const Renderer::Registry registry = Renderer::getRegistry();
  const char*              wanted   = m_rendererName.c_str();

  size_t position = 0;
  while(position < m_renderersSorted.size())
  {
    const char* candidate = registry[m_renderersSorted[position]]->name();
    if(strcmp(wanted, candidate) == 0)
    {
      m_tweak.renderer = int(position);
    }
    ++position;
  }
}

// Returns 'filename' unchanged when it is considered absolute (contains ':'
// on Windows, starts with '/' elsewhere); otherwise returns it appended to
// 'defaultPath' with a '/' in between.
static std::string addPath(std::string const& defaultPath, std::string const& filename)
{
#ifdef _WIN32
  const bool isAbsolute = filename.find(':') != std::string::npos;
#else
  const bool isAbsolute = !filename.empty() && filename[0] == '/';
#endif

  return isAbsolute ? filename : defaultPath + "/" + filename;
}

// Returns true when 's' ends with the suffix 'end' (an empty suffix always matches).
static bool endsWith(std::string const& s, std::string const& end)
{
  const std::string::size_type suffixLength = end.length();

  // a suffix longer than the string can never match
  if(s.length() < suffixLength)
  {
    return false;
  }

  return s.compare(s.length() - suffixLength, suffixLength, end) == 0;
}

// Registers the command line / config options of the sample with the
// parameter list. Called from the constructor.
void Sample::setupConfigParameters()
{
  // model file, recognized by its extension
  m_parameterList.addFilename(".csf", &m_modelFilename);
  m_parameterList.addFilename(".csf.gz", &m_modelFilename);
  m_parameterList.addFilename(".gltf", &m_modelFilename);

  m_parameterList.add("noui", &m_tweak.noUI, false);

  // renderer selection, either by index or by name
  m_parameterList.add("renderer", reinterpret_cast<uint32_t*>(&m_tweak.renderer));
  m_parameterList.add("renderernamed", &m_rendererName);

  // enum-typed tweaks are registered through their 32-bit representation
  m_parameterList.add("strategy", reinterpret_cast<uint32_t*>(&m_tweak.strategy));
  m_parameterList.add("shademode", reinterpret_cast<uint32_t*>(&m_tweak.shade));

  m_parameterList.add("msaa", &m_tweak.msaa);
  m_parameterList.add("clones", &m_tweak.clones);
  m_parameterList.add("xplode", &m_tweak.animateActive);
  m_parameterList.add("zoom", &m_tweak.zoom);
}


// Checks the parsed configuration: a model filename is mandatory. When it is
// missing, a usage hint and the list of parameters are printed.
// Returns true when the configuration is usable.
bool Sample::validateConfig()
{
  const bool hasModel = !m_modelFilename.empty();

  if(!hasModel)
  {
    LOGI("no .csf model file specified\n");
    LOGI("exe <filename.csf/cfg> parameters...\n");
    m_parameterList.print();
  }

  return hasModel;
}

}  // namespace csfviewer

using namespace csfviewer;

int main(int argc, const char** argv)
{
  NVPSystem system(PROJECT_NAME);

  Sample sample;

  {
    std::vector<std::string> directories;
    directories.push_back(NVPSystem::exePath());
    directories.push_back(NVPSystem::exePath() + "/media");
    directories.push_back(NVPSystem::exePath() + std::string(PROJECT_DOWNLOAD_RELDIRECTORY));
    sample.m_modelFilename = nvh::findFile(std::string("geforce.csf.gz"), directories);
  }

  return sample.run(PROJECT_NAME, argc, argv, SAMPLE_SIZE_WIDTH, SAMPLE_SIZE_HEIGHT);
}


================================================
FILE: cull-bitpack.vert.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 330
/**/

// Values for TEMPORAL (evaluated in main):
//   TEMPORAL_LAST keeps only bits that are also set in "last",
//   TEMPORAL_NEW  keeps only bits that are not set in "last".
#define TEMPORAL_LAST 1
#define TEMPORAL_NEW  2

// default: no temporal masking, the packed bits are stored unmodified
#ifndef TEMPORAL
#define TEMPORAL 0
#endif

#extension GL_ARB_explicit_attrib_location : require
// optional: when available the result is written to a storage buffer instead of a vertex output
#extension GL_ARB_shader_storage_buffer_object : enable

// 8 x uvec4 = 32 values per vertex; main packs the lowest bit of each into a single uint
layout(location=0) in uvec4 instream[8];

#if TEMPORAL
// bit mask the freshly packed bits are combined with in the temporal modes
layout(location=9) in uint last;
#endif

// Result sink. With GL_ARB_shader_storage_buffer_object the value is written into a
// storage buffer slot indexed by gl_VertexID, otherwise it leaves the shader as the
// flat vertex output "outstream".
#if GL_ARB_shader_storage_buffer_object
layout(std430,binding=0)  writeonly buffer outputBuffer {
  uint outstream[];
};

void storeOutput(uint packedBits)
{
  outstream[gl_VertexID] = packedBits;
}

#else
flat out uint outstream;

void storeOutput(uint packedBits)
{
  outstream = packedBits;
}
#endif

// Packs the lowest bit of the 32 incoming values into one 32-bit mask
// (value k ends up in bit k) and stores it, optionally masked by "last".
void main ()
{
  uint visMask = 0u;
  for (int slot = 0; slot < 8; slot++){
    // isolate bit 0 of the four values held in this attribute
    uvec4 flags = instream[slot] & uvec4(1u);
    for (int comp = 0; comp < 4; comp++){
      visMask |= flags[comp] << (slot * 4 + comp);
    }
  }

#if TEMPORAL == TEMPORAL_LAST
  // render what was visible in last frame and passes current test
  visMask &= last;
#elif TEMPORAL == TEMPORAL_NEW
  // render what was not visible in last frame (already rendered), but is now visible
  visMask &= (~last);
#endif

  storeOutput(visMask);
}


================================================
FILE: cull-downsample.frag.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 330 
/**/

// depth texture that is read at mip level "depthLod"
uniform sampler2D depthTex;
// source mip level that gets reduced
uniform int       depthLod;
// true selects the 2x2 path in main, false the conservative 3x3 path
uniform bool      evenLod;

// normalized coordinate of the fragment; only used by the 3x3 path
in vec2 uv;

// Writes the largest depth value found in a small neighbourhood of mip level
// "depthLod" to gl_FragDepth.
void main()
{
  ivec2 srcSize  = textureSize(depthTex,depthLod);
  ivec2 srcLast  = srcSize - ivec2(1);
  float farthest = 0;

  if (!evenLod){
    // need this to handle non-power of two
    // very conservative: 3x3 texels around the fragment's normalized position
    // (fetched with texelFetch; a filtered textureLod lookup at "pos" is not used)
    vec2 texel = 1.0/(vec2(srcSize));

    for (int dy = -1; dy <= 1; dy++){
      for (int dx = -1; dx <= 1; dx++){
        vec2  pos = uv + vec2(dx,dy) * texel;
        ivec2 tap = clamp(ivec2(pos * srcSize), ivec2(0), srcLast);
        farthest  = max(farthest, texelFetch(depthTex, tap, depthLod).r);
      }
    }
  }
  else{
    // exact 2x2 footprint of this fragment in the source level
    ivec2 base = ivec2(gl_FragCoord.xy) * 2;

    for (int dy = 0; dy < 2; dy++){
      for (int dx = 0; dx < 2; dx++){
        ivec2 tap = clamp(base + ivec2(dx,dy), ivec2(0), srcLast);
        farthest  = max(farthest, texelFetch(depthTex, tap, depthLod).r);
      }
    }
  }

  gl_FragDepth = farthest;
}


================================================
FILE: cull-downsample.vert.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 330
/**/

// normalized coordinate derived from the clip-space position (-1 maps to 0, +1 to 1)
out vec2 uv;

// Builds a single oversized triangle purely from gl_VertexID:
// every clip-space coordinate is either -1 or 3.
void main()
{
  vec2 corner = vec2(gl_VertexID & 1, (gl_VertexID >> 1) & 1) * 4.0 - 1.0;

  uv          = corner * 0.5 + 0.5;
  gl_Position = vec4(corner, 0, 1.0);
}


================================================
FILE: cull-raster.frag.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 430
/**/

// run depth/stencil tests before the shader, so only fragments that pass reach main
layout(early_fragment_tests) in;

// one integer flag per object id
layout(std430,binding=0) buffer visibleBuffer {
  int visibles[];
};

layout(location=0,index=0) out vec4 out_Color;

// id of the object whose proxy geometry produced this fragment
flat in int objid;

// Flags the object as visible and outputs its id encoded as a color.
void main (){
  out_Color = unpackUnorm4x8(uint(objid));

  visibles[objid] = 1;
}


================================================
FILE: cull-raster.geo.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 430
/**/

// Layout of the matrix buffer as used in main: every object owns MATRICES matrices of
// 4 texels each; MATRIX_WORLD / MATRIX_WORLD_IT select the matrix within that group.
#ifndef MATRIX_WORLD
#define MATRIX_WORLD    0
#endif

#ifndef MATRIX_WORLD_IT
#define MATRIX_WORLD_IT 1
#endif

#ifndef MATRICES
#define MATRICES        2
#endif

// non-zero: negates the facing sign and selects the alternative vertex order in main
#ifndef FLIPWIND
#define FLIPWIND        1
#endif

// non-zero: facing is derived from viewPos, otherwise from viewDir
#ifndef PERSPECTIVE
#define PERSPECTIVE     1
#endif

// render the 3 visible sides based on view direction and box normal
layout(points,invocations=3) in;  

// one side each invocation
layout(triangle_strip,max_vertices=4) out;

// per-object data handed over by the vertex stage
in VertexOut{
  vec3 bboxCtr;
  vec3 bboxDim;
  flat int matrixIndex;
  flat int objid;
} IN[1];

// object id forwarded to the fragment stage
flat out int objid;

uniform vec3 viewPos;
uniform vec3 viewDir;
uniform mat4 viewProjTM;
// matrices stored as consecutive texels, one column per texel (see main)
uniform samplerBuffer matricesTex;

// Emits one corner of the current box side: tags it with the object id and
// transforms the world-space position into clip space.
void emitCorner(vec3 worldPos)
{
  objid = IN[0].objid;
  gl_Position = viewProjTM * vec4(worldPos,1);
  EmitVertex();
}

// Each of the three invocations handles one box axis and emits the side of the box
// along that axis which is selected by the facing test below, as a 4-vertex strip.
void main()
{
  int  base = (IN[0].matrixIndex*MATRICES + MATRIX_WORLD)*4;
  mat4 worldTM = mat4(
    texelFetch(matricesTex,base + 0),
    texelFetch(matricesTex,base + 1),
    texelFetch(matricesTex,base + 2),
    texelFetch(matricesTex,base + 3));
  mat3 worldRot = mat3(worldTM);

  // invocation 0/1/2 uses x/y/z as face normal axis; the two edge axes follow cyclically
  int axisN = gl_InvocationID;
  int axisU = (axisN + 1) % 3;
  int axisV = (axisN + 2) % 3;

  vec3 dim        = IN[0].bboxDim;
  vec3 faceNormal = vec3(0);
  vec3 edgeBasis0 = vec3(0);
  vec3 edgeBasis1 = vec3(0);
  faceNormal[axisN] = dim[axisN];
  edgeBasis0[axisU] = dim[axisU];
  edgeBasis1[axisV] = dim[axisV];

  vec3 worldCtr    = (worldTM * vec4(IN[0].bboxCtr, 1)).xyz;
  vec3 worldNormal = worldRot * faceNormal;

  // sign that picks which of the two opposing sides gets rendered
#if PERSPECTIVE
  vec3  worldPos = worldCtr + worldNormal;
  float proj     = sign(dot(worldPos - viewPos.xyz, worldNormal));
#else
  float proj     = sign(dot(viewDir,worldNormal));
#endif

#if FLIPWIND
  proj = -proj;
#endif

  faceNormal = worldNormal * proj;
  edgeBasis0 = worldRot * edgeBasis0;
  edgeBasis1 = worldRot * edgeBasis1 * proj;

  // the four corners share first and last vertex, FLIPWIND swaps the two in between
  emitCorner(worldCtr + (faceNormal - edgeBasis0 - edgeBasis1));
#if FLIPWIND
  emitCorner(worldCtr + (faceNormal + edgeBasis0 - edgeBasis1));
  emitCorner(worldCtr + (faceNormal - edgeBasis0 + edgeBasis1));
#else
  emitCorner(worldCtr + (faceNormal - edgeBasis0 + edgeBasis1));
  emitCorner(worldCtr + (faceNormal + edgeBasis0 - edgeBasis1));
#endif
  emitCorner(worldCtr + (faceNormal + edgeBasis0 + edgeBasis1));
}


================================================
FILE: cull-raster.vert.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 430
/**/

// Layout of the matrix buffer as used in main: every object owns MATRICES matrices of
// 4 texels each; MATRIX_WORLD / MATRIX_WORLD_IT select the matrix within that group.
#ifndef MATRIX_WORLD
#define MATRIX_WORLD    0
#endif

#ifndef MATRIX_WORLD_IT
#define MATRIX_WORLD_IT 1
#endif

#ifndef MATRICES
#define MATRICES        2
#endif

// one integer flag per object id, set to 1 in main when the camera is inside the box
layout(std430,binding=0) buffer visibleBuffer {
  int visibles[];
};

uniform samplerBuffer matricesTex;

#ifdef DUALINDEX
// dual-index mode: the vertex carries an index into bboxesTex (2 texels per box: min, max)
layout(location=0) in int  bboxIndex;
layout(location=2) in int  matrixIndex;
uniform samplerBuffer     bboxesTex;

vec4 bboxMin = texelFetch(bboxesTex, bboxIndex*2+0);
vec4 bboxMax = texelFetch(bboxesTex, bboxIndex*2+1);
#else
// direct mode: box min/max are supplied as vertex attributes
layout(location=0) in vec4 bboxMin;
layout(location=1) in vec4 bboxMax;
layout(location=2) in int  matrixIndex;
#endif

uniform vec3 viewPos;

// per-object data passed on to the next stage
out VertexOut{
  vec3 bboxCtr;
  vec3 bboxDim;
  flat int matrixIndex;
  flat int objid;
} OUT;

// Forwards box center, half extents, matrix index and object id (= gl_VertexID) and
// directly flags the object as visible when the camera is inside its box.
void main()
{
  vec3 center     = ((bboxMin + bboxMax)*0.5).xyz;
  vec3 halfExtent = ((bboxMax - bboxMin)*0.5).xyz;

  OUT.bboxCtr     = center;
  OUT.bboxDim     = halfExtent;
  OUT.matrixIndex = matrixIndex;
  OUT.objid       = gl_VertexID;

  // if camera is inside the bbox then none of our
  // side faces will be visible, must treat object as
  // visible
  int  base = (matrixIndex * MATRICES + MATRIX_WORLD_IT)*4;
  mat4 worldInvTransTM = mat4(
    texelFetch(matricesTex,base + 0),
    texelFetch(matricesTex,base + 1),
    texelFetch(matricesTex,base + 2),
    texelFetch(matricesTex,base + 3));

  // row-vector multiply, i.e. the transposed matrix is applied to the view position
  vec3  localEye = (vec4(viewPos,1) * worldInvTransTM).xyz - center;
  bvec3 within   = lessThan(abs(localEye), halfExtent);

  if (within.x && within.y && within.z){
    // inside bbox
    visibles[gl_VertexID] = 1;
  }
}


================================================
FILE: cull-tokencmds.vert.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 440
/**/

// number of scan entries per batch; getOffset uses id / SCAN_BATCHSIZE to pick the
// entry of cullScanOffsets that has to be added
#define SCAN_BATCHSIZE 2048

// per-command vertex inputs
// cmdOffset:   index of the command's first uint inside incmds
// cmdCullSize: number of uints copied for this command (0: nothing is copied)
// cmdCullScan: scan value of this command, turned into an offset by getOffset
layout(location=0) in uint  cmdOffset;
layout(location=1) in uint  cmdCullSize;
layout(location=2) in uint  cmdCullScan;

// description of the sequence being processed, see rebaseOffset and main
uniform uint startOffset;
uniform int  startID;
uniform uint endOffset;
uniform int  endID;
// value written behind the last copied command if the sequence got shorter
uniform uint terminateCmd;

layout(std430,binding=0)  writeonly buffer outputBuffer {
  uint outcmds[];
};

layout(std430,binding=1)  readonly buffer commandBuffer {
  uint incmds[];
};

layout(std430,binding=2)  readonly buffer cullSizesBuffer {
  uint cullSizes[];
};

layout(std430,binding=3)  readonly buffer cullScanBuffer {
  uint cullScan[];
};

layout(std430,binding=4)  readonly buffer cullScanOffsetBuffer {
  uint cullScanOffsets[];
};

// Turns a per-batch scan value into an offset across all batches.
// id:        index of the command
// scan:      scan value of the command within its batch
// size:      size of the command itself
// exclusive: true subtracts the command's own size from the result
uint getOffset( int id, uint scan, uint size, bool exclusive)
{
  int  batch  = id / SCAN_BATCHSIZE;
  uint result = scan;

  // every batch but the first continues from the value stored for its predecessor
  if (batch > 0){
    result += cullScanOffsets[batch - 1];
  }

  return exclusive ? result - size : result;
}

// Convenience overload that looks up scan value and size of command "id" in the buffers.
uint getOffset( int id, bool exclusive)
{
  uint scan = cullScan[id];
  uint size = cullSizes[id];
  return getOffset(id, scan, size, exclusive);
}

// Converts an offset of the culled stream into an output offset of the current sequence.
uint rebaseOffset(uint cullOffset)
{
  // culled-stream offset at which the current sequence begins
  uint sequenceBegin = getOffset(startID, true);

  // distance from that begin, re-applied on top of the sequence's start offset
  uint relative = cullOffset - sequenceBegin;
  return startOffset + relative;
}

#define DEBUG 0

// Copies the uints of every command with a non-zero cull size to its compacted
// position; the vertex handling startID additionally writes the terminate command
// when the compacted sequence ends before endOffset.
void main ()
{
  if (cmdCullSize > 0)
  {
    // cullOffset goes across "stateobject" sequences
    uint culledPos = getOffset(gl_VertexID,cmdCullScan,cmdCullSize,true);
    uint dst       = rebaseOffset(culledPos);

  #if DEBUG
    outcmds[(gl_VertexID)*2+0] = dst;
    outcmds[(gl_VertexID)*2+1] = cmdOffset;
  #else
    for (uint word = 0u; word < cmdCullSize; word++){
      outcmds[dst + word] = incmds[cmdOffset + word];
    }
  #endif
  }
#if DEBUG
  else {
    outcmds[(gl_VertexID)*2+0] = ~0;
    outcmds[(gl_VertexID)*2+1] = cmdOffset;
  }
#endif

  // only one vertex per sequence takes care of the terminator
  if (gl_VertexID != startID){
    return;
  }

  // add terminator if sequence not original
  uint culledEnd = rebaseOffset( getOffset(endID, false) );
#if !DEBUG
  if (culledEnd != endOffset) {
    outcmds[culledEnd] = terminateCmd;
  }
#endif

#if DEBUG && 0
  outcmds[(startID)*2+0] = culledEnd;
  outcmds[(startID)*2+1] = endOffset;
#endif
}


================================================
FILE: cull-tokensizes.vert.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 440
/**/

// size of the command and the object it belongs to (negative: not tied to an object)
layout(location=0) in uint  cmdSize;
layout(location=1) in int   cmdObject;

layout(std430,binding=0)  writeonly buffer outputBuffer {
  uint outsizes[];
};

// visibility bits, 32 objects per int
layout(std430,binding=1)  readonly buffer visibleBuffer {
  int visibles[];
};

// true disables the visibility lookup, every command then keeps its size
#define DEBUG false

// Writes the command's size, or 0 when the command's object has its visibility bit cleared.
void main ()
{
  uint size = cmdSize;

  if (cmdObject >= 0 && !DEBUG){
    int word = visibles[cmdObject/32];
    int bit  = 1<<(cmdObject%32);
    if ((word & bit) == 0){
      size = 0;
    }
  }

  outsizes[gl_VertexID] = size;
}


================================================
FILE: cull-xfb.vert.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 330
/**/

// Layout of the matrix buffer as used in main: every object owns MATRICES matrices of
// 4 texels each; MATRIX_WORLD / MATRIX_WORLD_IT select the matrix within that group.
#ifndef MATRIX_WORLD
#define MATRIX_WORLD    0
#endif

#ifndef MATRIX_WORLD_IT
#define MATRIX_WORLD_IT 1
#endif

#ifndef MATRICES
#define MATRICES        2
#endif

#extension GL_ARB_explicit_attrib_location : require
// optional: when available the result is written to a storage buffer instead of a vertex output
#extension GL_ARB_shader_storage_buffer_object : enable


// OCCLUSION adds the depth-texture test in main; it is not defined here
// (presumably supplied from outside for the occlusion variant - TODO confirm)
//#define OCCLUSION

#ifdef DUALINDEX
// dual-index mode: the vertex carries an index into bboxesTex (2 texels per box: min, max)
layout(location=0) in int  bboxIndex;
layout(location=2) in int  matrixIndex;

uniform samplerBuffer     bboxesTex;
vec4 bboxMin = texelFetch(bboxesTex, bboxIndex*2+0);
vec4 bboxMax = texelFetch(bboxesTex, bboxIndex*2+1);
#else
// direct mode: box min/max are supplied as vertex attributes
layout(location=0) in vec4 bboxMin;
layout(location=1) in vec4 bboxMax;
layout(location=2) in int  matrixIndex;
#endif

// Result sink. With GL_ARB_shader_storage_buffer_object the value is written into a
// storage buffer slot indexed by gl_VertexID, otherwise it leaves the shader as the
// flat vertex output "outstream".
#if GL_ARB_shader_storage_buffer_object
layout(std430,binding=0)  writeonly buffer outputBuffer {
  int outstream[];
};

void storeOutput(int visibleFlag)
{
  outstream[gl_VertexID] = visibleFlag;
}

#else
flat out int outstream;

void storeOutput(int visibleFlag)
{
  outstream = visibleFlag;
}
#endif

// view-projection matrix and the buffer texture holding the object matrices
uniform mat4              viewProjTM;
uniform samplerBuffer     matricesTex;

#ifdef OCCLUSION
// depth texture sampled with explicit mip levels by the occlusion test
uniform sampler2D         depthTex;
#endif

// Returns corner n (0..7) of the box as a point with w = 1.
// Bits 0, 1 and 2 of n choose bboxMax over bboxMin for x, y and z respectively.
vec4 getBoxCorner(int n)
{
  bvec3 useMax = notEqual(ivec3(n) & ivec3(1,2,4), ivec3(0));
  vec3  corner = mix(bboxMin.xyz, bboxMax.xyz, useMax);
  return vec4(corner,1);
}

// Transforms pos by a and performs the perspective divide.
vec3 projected(mat4 a, vec4 pos) {
  vec4 clip = a * pos;
  return clip.xyz / clip.w;
}

// Projects the eight box corners, tests the resulting bounds against the [-1,1] cube
// and, with OCCLUSION, against the depth texture. Stores 1 (visible) or 0.
void main (){
  int  base = (matrixIndex*MATRICES + MATRIX_WORLD)*4;
  mat4 worldTM = mat4(
    texelFetch(matricesTex,base + 0),
    texelFetch(matricesTex,base + 1),
    texelFetch(matricesTex,base + 2),
    texelFetch(matricesTex,base + 3));

  mat4 worldViewProjTM = (viewProjTM * worldTM);

  // clipspace bbox
  vec3 clipmin = projected(worldViewProjTM, getBoxCorner(0));
  vec3 clipmax = clipmin;

  for (int corner = 1; corner < 8; corner++){
    vec3 p  = projected(worldViewProjTM, getBoxCorner(corner));
    clipmin = min(clipmin,p);
    clipmax = max(clipmax,p);
  }

  // bounds overlap the cube when min <= 1 and max >= -1 on every axis
  bool overlaps = all(lessThanEqual(clipmin, vec3(1))) &&
                  all(greaterThanEqual(clipmax, vec3(-1)));
  int isvisible = overlaps ? 1 : 0;

#ifdef OCCLUSION
  if (isvisible != 0){
    // move bounds from [-1,1] to [0,1]
    clipmin = clipmin * 0.5 + 0.5;
    clipmax = clipmax * 0.5 + 0.5;

    // mip level chosen from the larger extent of the bounds measured in texels
    vec2  extent    = (clipmax.xy - clipmin.xy);
    ivec2 texsize   = textureSize(depthTex,0);
    float maxTexels = max(extent.x, extent.y) * float(max(texsize.x,texsize.y));
    float miplevel  = ceil(log2(maxTexels));

    // largest depth at the four corners of the bounds
    float a = textureLod(depthTex,clipmin.xy,miplevel).r;
    float b = textureLod(depthTex,vec2(clipmax.x,clipmin.y),miplevel).r;
    float c = textureLod(depthTex,clipmax.xy,miplevel).r;
    float d = textureLod(depthTex,vec2(clipmin.x,clipmax.y),miplevel).r;
    float depth = 0;
    depth = max(depth,max(max(max(a,b),c),d));

    isvisible = clipmin.z <= depth ? 1 : 0;
  }
#endif

  storeOutput(isvisible);
}


================================================
FILE: cullingsystem.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include "cullingsystem.hpp"
#include <assert.h>
#include <string.h>

#define DEBUG_VISIBLEBOXES  0

// Returns val / alignment rounded up, i.e. the number of alignment-sized chunks
// needed to hold val elements. alignment must not be 0.
inline unsigned int minDivide(unsigned int val, unsigned int alignment)
{
  assert(alignment != 0);
  // quotient plus remainder test instead of (val+alignment-1)/alignment,
  // which wraps around for values close to the unsigned maximum
  return val / alignment + ((val % alignment) != 0 ? 1u : 0u);
}

// Initializes the culling system for the given programs and creates the GL objects
// it owns: one framebuffer and two buffer textures.
void CullingSystem::init( const Programs &programs, bool dualindex )
{
  // program-dependent setup
  update(programs, dualindex);

  // objects deleted again in deinit()
  glCreateTextures(GL_TEXTURE_BUFFER, 2, m_tbo);
  glGenFramebuffers(1, &m_fbo);
}

// (Re)binds the culling system to a set of shader programs.
//
// programs  - program handles used by the culling passes; copied into m_programs
// dualindex - true if the bbox programs read their boxes through the "bboxesTex"
//             sampler, which is then assigned texture unit 1
//
// Without SSBO support the bit-packing and the frustum/hiz programs are re-linked with
// "outstream" registered as transform feedback varying. Afterwards the sampler uniforms
// get their fixed texture units and the locations of the uniforms that change at
// runtime are cached in m_uniforms. Leaves no program bound.
void CullingSystem::update( const Programs &programs, bool dualindex )
{
  m_programs = programs;
  m_dualindex = dualindex;
  // NOTE(review): the shaders select their SSBO path via GL_ARB_shader_storage_buffer_object,
  // while this flag is derived from the GL version - confirm both always agree
  m_useSSBO = has_GL_VERSION_4_2 != 0;
  m_useRepesentativeTest = !!has_GL_NV_representative_fragment_test;

  if (!m_useSSBO)
  {
    // varyings only take effect at link time, hence the re-link of every program
    const char* xfbstreams[] = {"outstream"};
    glTransformFeedbackVaryings(programs.bit_regular,1,xfbstreams,GL_INTERLEAVED_ATTRIBS);
    glLinkProgram(programs.bit_regular);

    glTransformFeedbackVaryings(programs.bit_temporallast,1,xfbstreams,GL_INTERLEAVED_ATTRIBS);
    glLinkProgram(programs.bit_temporallast);

    glTransformFeedbackVaryings(programs.bit_temporalnew,1,xfbstreams,GL_INTERLEAVED_ATTRIBS);
    glLinkProgram(programs.bit_temporalnew);

    glTransformFeedbackVaryings(programs.object_frustum,1,xfbstreams,GL_INTERLEAVED_ATTRIBS);
    glLinkProgram(programs.object_frustum);

    glTransformFeedbackVaryings(programs.object_hiz,1,xfbstreams,GL_INTERLEAVED_ATTRIBS);
    glLinkProgram(programs.object_hiz);
  }

  // depth mip-map generation: source depth texture on unit 0
  glUseProgram(programs.depth_mips);
  glUniform1i(glGetUniformLocation(programs.depth_mips,"depthTex"),0);
  m_uniforms.depth_lod = glGetUniformLocation(programs.depth_mips,"depthLod");
  m_uniforms.depth_even = glGetUniformLocation(programs.depth_mips,"evenLod");

  // frustum test: matrices on unit 0, bboxes on unit 1
  glUseProgram(programs.object_frustum);
  glUniform1i(glGetUniformLocation(programs.object_frustum,"matricesTex"),0);
  if (dualindex){
    glUniform1i(glGetUniformLocation(programs.object_frustum,"bboxesTex"),1);
  }
  m_uniforms.frustum_viewProj = glGetUniformLocation(programs.object_frustum, "viewProjTM");

  // hiz test: additionally the depth texture on unit 2
  glUseProgram(programs.object_hiz);
  glUniform1i(glGetUniformLocation(programs.object_hiz,"matricesTex"),0);
  if (dualindex){
    // uniform locations are per program: query the program that is current,
    // not object_frustum
    glUniform1i(glGetUniformLocation(programs.object_hiz,"bboxesTex"),1);
  }
  glUniform1i(glGetUniformLocation(programs.object_hiz,"depthTex"),2);
  m_uniforms.hiz_viewProj = glGetUniformLocation(programs.object_hiz, "viewProjTM");

  // raster test
  glUseProgram(programs.object_raster);
  glUniform1i(glGetUniformLocation(programs.object_raster,"matricesTex"),0);
  if (dualindex){
    // same as above: the location has to come from object_raster itself
    glUniform1i(glGetUniformLocation(programs.object_raster,"bboxesTex"),1);
  }
  m_uniforms.raster_viewProj = glGetUniformLocation(programs.object_raster, "viewProjTM");
  m_uniforms.raster_viewPos  = glGetUniformLocation(programs.object_raster, "viewPos");
  m_uniforms.raster_viewDir  = glGetUniformLocation(programs.object_raster, "viewDir");

  glUseProgram(0);
}

// Deletes the GL objects owned by the culling system: both buffer textures and the framebuffer.
void CullingSystem::deinit()
{
  glDeleteTextures(2, m_tbo);
  glDeleteFramebuffers(1, &m_fbo);
}

// Fills the mip levels 1..n of a depth texture by drawing into each level with the
// depth_mips program, which gets the previous level's index as "depthLod".
//
// textureDepth  - depth texture, bound to unit 0 as source and attached level by level
//                 as depth-stencil target of m_fbo
// width, height - size of level 0; also used to restore the viewport afterwards
//
// State left behind: program 0, framebuffer 0, no 2D texture on unit 0,
// depth func GL_LEQUAL, viewport (0,0,width,height).
void CullingSystem::buildDepthMipmaps( GLuint textureDepth, int width, int height )
{
  int level = 0;
  int dim = width > height ? width : height;
  int twidth  = width;
  int theight = height;
  int wasEven = 0;

  glBindFramebuffer(GL_FRAMEBUFFER,m_fbo);
  // every fragment has to overwrite the target level's depth
  glDepthFunc(GL_ALWAYS);
  glUseProgram(m_programs.depth_mips);
  glActiveTexture(GL_TEXTURE0);
  glBindTexture(GL_TEXTURE_2D, textureDepth);

  // one iteration per mip level, driven by the larger of both dimensions
  while (dim){
    if (level){
      // the smaller dimension may already have reached zero
      twidth  = twidth < 1 ? 1 : twidth;
      theight = theight < 1 ? 1 : theight;
      glViewport(0,0,twidth,theight);
      glFramebufferTexture2D(GL_FRAMEBUFFER,GL_DEPTH_STENCIL_ATTACHMENT,GL_TEXTURE_2D, textureDepth, level);
      glUniform1i(m_uniforms.depth_lod, level-1);
      glUniform1i(m_uniforms.depth_even, wasEven);

      // three vertices, no vertex attributes are set up for this draw
      glDrawArrays(GL_TRIANGLES,0,3);
    }

    // remembered for the next level: were both source dimensions even?
    wasEven = (twidth % 2 == 0) && (theight % 2 == 0);

    dim       /=  2;
    twidth    /=  2;
    theight   /=  2;
    level++;
  }

  // restore state (the viewport needs to be reset only once)
  glUseProgram(0);
  glBindFramebuffer(GL_FRAMEBUFFER,0);
  glBindTexture(GL_TEXTURE_2D, 0);
  glDepthFunc(GL_LEQUAL);
  glViewport(0,0,width,height);
}


// Draws one point per object of the job using whatever program is current.
//
// job    - supplies the per-object bbox / matrix-index vertex streams, the matrix buffer
//          (and in dual-index mode the bbox buffer) and the output buffer
// raster - true:  points are rasterized; depth and color writes are masked out unless
//                 DEBUG_VISIBLEBOXES is set
//          false: rasterization is discarded and job.m_bufferVisOutput is bound either
//                 as storage buffer (binding 0) or as transform feedback target
//
// The function does not select a program itself. All vertex attribute arrays, texture
// and buffer bindings touched here are reset before returning.
void CullingSystem::testBboxes( Job &job, bool raster )
{
  // send the scene's bboxes as points stream

  glBindBuffer(GL_ARRAY_BUFFER, job.m_bufferObjectBbox.buffer);
  if (m_dualindex){
    // attribute 0: integer index of the object's bbox
    glVertexAttribIPointer(0, 1, GL_INT, job.m_bufferObjectBbox.stride, (const void*) job.m_bufferObjectBbox.offset);
    glVertexAttribDivisor(0, 0);
    glEnableVertexAttribArray(0);
  }
  else{
    // attributes 0 and 1: two consecutive vec4 per object; without an explicit
    // stride the pair is assumed to be tightly packed
    GLsizei stride = job.m_bufferObjectBbox.stride ? job.m_bufferObjectBbox.stride : sizeof(float)*4*2;
    glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, stride, (const void*)job.m_bufferObjectBbox.offset);
    glVertexAttribDivisor(0, 0);
    glEnableVertexAttribArray(0);
    glVertexAttribPointer(1, 4, GL_FLOAT, GL_FALSE, stride, (const void*)(sizeof(float)*4 + job.m_bufferObjectBbox.offset));
    glVertexAttribDivisor(1, 0);
    glEnableVertexAttribArray(1);
  }
  
  // attribute 2: integer matrix index of the object
  glBindBuffer(GL_ARRAY_BUFFER, job.m_bufferObjectMatrix.buffer);
  glVertexAttribIPointer(2, 1, GL_INT, job.m_bufferObjectMatrix.stride, (const void*) job.m_bufferObjectMatrix.offset);
  glVertexAttribDivisor(2, 0);
  glEnableVertexAttribArray(2);
  glBindBuffer(GL_ARRAY_BUFFER, 0);
  
  // unit 0: matrices as RGBA32F buffer texture
  glActiveTexture(GL_TEXTURE0);
  glBindTexture(GL_TEXTURE_BUFFER, m_tbo[0]);
  job.m_bufferMatrices.TexBuffer(GL_TEXTURE_BUFFER,GL_RGBA32F);

  if (m_dualindex){
    // unit 1: bboxes as RGBA32F buffer texture
    glActiveTexture(GL_TEXTURE1);
    glBindTexture(GL_TEXTURE_BUFFER, m_tbo[1]);
    job.m_bufferBboxes.TexBuffer(GL_TEXTURE_BUFFER,GL_RGBA32F);
  }

  if (raster){
    if (m_useRepesentativeTest) {
      glEnable( GL_REPRESENTATIVE_FRAGMENT_TEST_NV );
    }
#if !DEBUG_VISIBLEBOXES
    // the test geometry must not modify the framebuffer
    glDepthMask(GL_FALSE);
    glColorMask(GL_FALSE,GL_FALSE,GL_FALSE,GL_FALSE);
#endif
  }
  else if (m_useSSBO){
    glEnable(GL_RASTERIZER_DISCARD);
    job.m_bufferVisOutput.BindBufferRange(GL_SHADER_STORAGE_BUFFER,0);
  }
  else{
    glEnable(GL_RASTERIZER_DISCARD);
    // setup transform feedback
    job.m_bufferVisOutput.BindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,0);
    glBeginTransformFeedback(GL_POINTS);
  }

  glDrawArrays(GL_POINTS,0,job.m_numObjects);

  // undo the state changes of the branch taken above
  if (raster){
    if (m_useRepesentativeTest) {
      glDisable( GL_REPRESENTATIVE_FRAGMENT_TEST_NV );
    }
#if !DEBUG_VISIBLEBOXES
    glDepthMask(GL_TRUE);
    glColorMask(GL_TRUE,GL_TRUE,GL_TRUE,GL_TRUE);
#endif
  }
  else if (m_useSSBO){
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER,0,0);
    glDisable(GL_RASTERIZER_DISCARD);
  }
  else{
    glEndTransformFeedback();
    glBindBufferBase(GL_TRANSFORM_FEEDBACK_BUFFER,0,0);
    glDisable(GL_RASTERIZER_DISCARD);
  }

  if (m_dualindex){
    // unit 1 is still active here, unbind it before switching back to unit 0
    glBindTexture(GL_TEXTURE_BUFFER, 0);
    glActiveTexture(GL_TEXTURE0);
  }
  glBindTexture(GL_TEXTURE_BUFFER, 0);
  
  // attribute 1 is disabled as well, even if the dual-index path never enabled it
  glDisableVertexAttribArray(0);
  glDisableVertexAttribArray(1);
  glDisableVertexAttribArray(2);
  
}

// Packs the per-object results in job.m_bufferVisOutput into bit masks stored in
// job.m_bufferVisBitsCurrent: one point is drawn per 32 objects, each point reading
// 32 consecutive integers.
//
// job  - provides the buffers and the object count
// type - BITS_CURRENT uses the plain packing program; BITS_CURRENT_AND_LAST selects
//        bit_temporallast and any other value bit_temporalnew, both of which also
//        read job.m_bufferVisBitsLast through attribute 9
void CullingSystem::bitsFromOutput( Job &job, BitType type)
{
  // for GL 3.3 compatibility we use xfb
  // in GL 4.3 SSBO is used
  // 
  // using compute instead of "invisible" point drawing
  // would be better if we had really huge thread counts

  glEnable(GL_RASTERIZER_DISCARD);

  // attributes 0..7: eight uvec4 per vertex, vertices are 32 integers apart
  glBindBuffer(GL_ARRAY_BUFFER, job.m_bufferVisOutput.buffer);
  for (int i = 0; i < 8; i++){
    glVertexAttribIPointer(i, 4, GL_UNSIGNED_INT, sizeof(int)*32, (const void*)(i*sizeof(int)*4 + job.m_bufferVisOutput.offset));
    glVertexAttribDivisor(i, 0);
    glEnableVertexAttribArray(i);
  }
  
  if (type == BITS_CURRENT){
    glUseProgram(m_programs.bit_regular);
  }
  else{
    glUseProgram(type == BITS_CURRENT_AND_LAST ? m_programs.bit_temporallast : m_programs.bit_temporalnew);

    // attribute 9: one mask of the previous result per vertex
    glBindBuffer(GL_ARRAY_BUFFER, job.m_bufferVisBitsLast.buffer);
    glVertexAttribIPointer(9, 1, GL_UNSIGNED_INT, sizeof(int), (const void*)job.m_bufferVisBitsLast.offset);
    glEnableVertexAttribArray(9);
  }

  if (m_useSSBO){
    job.m_bufferVisBitsCurrent.BindBufferRange(GL_SHADER_STORAGE_BUFFER,0);
    // barrier issued before the draw that fetches the output buffer as vertex attributes
    glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT);
  }
  else{
    job.m_bufferVisBitsCurrent.BindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER,0);
    glBeginTransformFeedback(GL_POINTS);
  }

  // one point per started group of 32 objects
  glDrawArrays(GL_POINTS,0, minDivide(job.m_numObjects,32));

  if (m_useSSBO){
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, 0);
    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, 0);
  }
  else{
    glEndTransformFeedback();
    glBindBufferBase(GL_TRANSFORM_FEEDBACK_BUFFER, 0, 0);
  }
  
  // attribute 9 is disabled unconditionally, also when it was not enabled above
  glDisableVertexAttribArray(9);
  for (int i = 0; i < 8; i++){
    glDisableVertexAttribArray(i);
  }

  glDisable(GL_RASTERIZER_DISCARD);
  glBindBuffer(GL_ARRAY_BUFFER, 0);
}

// Lets the job turn its current visibility bits into the job-specific result.
void CullingSystem::resultFromBits( Job &job )
{
  const Buffer& currentBits = job.m_bufferVisBitsCurrent;
  job.resultFromBits(currentBits);
}

// Forwards to the job's resultClient() implementation.
void CullingSystem::resultClient(Job &job)
{
  Job& target = job;
  target.resultClient();
}

// Runs the bbox visibility test of the requested method and leaves the per-object
// results in job.m_bufferVisOutput. Unknown methods are ignored.
//
// method - METHOD_FRUSTUM, METHOD_HIZ or METHOD_RASTER
// job    - culling job, see testBboxes
// view   - view-projection matrix; the raster method also uses position and direction
void CullingSystem::buildOutput( MethodType method, Job &job, const View& view )
{
  if (method == METHOD_FRUSTUM)
  {
    glUseProgram(m_programs.object_frustum);
    glUniformMatrix4fv(m_uniforms.frustum_viewProj, 1 ,GL_FALSE, view.viewProjMatrix);

    testBboxes(job,false);
  }
  else if (method == METHOD_HIZ)
  {
    glUseProgram(m_programs.object_hiz);
    glUniformMatrix4fv(m_uniforms.hiz_viewProj, 1, GL_FALSE, view.viewProjMatrix);

    // depth texture with mip-maps goes to unit 2 for the duration of the test
    glActiveTexture(GL_TEXTURE2);
    glBindTexture(GL_TEXTURE_2D,job.m_textureDepthWithMipmaps);

    testBboxes(job,false);

    glActiveTexture(GL_TEXTURE2);
    glBindTexture(GL_TEXTURE_2D,0);
    glActiveTexture(GL_TEXTURE0);
  }
  else if (method == METHOD_RASTER)
  {
    // clear visibles
    job.m_bufferVisOutput.BindBufferRange(GL_SHADER_STORAGE_BUFFER,0);
    glClearBufferData(GL_SHADER_STORAGE_BUFFER, GL_R32UI,GL_RED_INTEGER,GL_UNSIGNED_INT,0);

    glUseProgram(m_programs.object_raster);
    glUniform3fv(m_uniforms.raster_viewDir, 1, view.viewDir);
    glUniform3fv(m_uniforms.raster_viewPos, 1, view.viewPos);
    glUniformMatrix4fv(m_uniforms.raster_viewProj, 1, GL_FALSE, view.viewProjMatrix);

    // the boxes are drawn with a negative polygon offset
    glEnable( GL_POLYGON_OFFSET_FILL );
    glPolygonOffset(-1,-1);
    testBboxes(job,true);
    glPolygonOffset(0,0);
    glDisable( GL_POLYGON_OFFSET_FILL );

    // barrier for the shader writes into the visibility buffer
    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

    glBindBufferBase (GL_SHADER_STORAGE_BUFFER,0,0);
  }
}


// Exchanges the job's "current" and "last" visibility bit buffers
// (used by temporally coherent techniques).
void CullingSystem::swapBits( Job &job )
{
  const Buffer previousLast = job.m_bufferVisBitsLast;
  job.m_bufferVisBitsLast    = job.m_bufferVisBitsCurrent;
  job.m_bufferVisBitsCurrent = previousLast;
}


void CullingSystem::JobIndirectUnordered::resultFromBits( const Buffer& bufferVisBitsCurrent )
{
  glEnable(GL_RASTERIZER_DISCARD);

  glUseProgram(m_program_indirect_compact);

  m_bufferIndirectCounter.BindBufferRange(GL_ATOMIC_COUNTER_BUFFER, 0);
  m_bufferIndirectCounter.ClearBufferSubData (GL_ATOMIC_COUNTER_BUFFER, GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, 0);

  bufferVisBitsCurrent.   BindBufferRange(GL_SHADER_STORAGE_BUFFER, 2);
  m_bufferObjectIndirects.BindBufferRange(GL_SHADER_STORAGE_BUFFER, 1);
  m_bufferIndirectResult. BindBufferRange(GL_SHADER_STORAGE_BUFFER, 0);
  m_bufferIndirectResult. ClearBufferSubData(GL_SHADER_STORAGE_BUFFER, GL_R32UI, GL_RED_INTEGER, GL_UNSIGNED_INT, 0);

  glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
  glDrawArrays(GL_POINTS,0,m_numObjects);

  glDisable(GL_RASTERIZER_DISCARD);
  glBindBufferBase  (GL_ATOMIC_COUNTER_BUFFER, 0, 0);
  glBindBufferBase  (GL_SHADER_STORAGE_BUFFER, 2, 0);
  glBindBufferBase  (GL_SHADER_STORAGE_BUFFER, 1, 0);
  glBindBufferBase  (GL_SHADER_STORAGE_BUFFER, 0, 0);
}

void CullingSystem::JobReadback::resultFromBits( const Buffer& bufferVisBitsCurrent )
{
  GLsizeiptr size = sizeof(int) * minDivide(m_numObjects,32);
  glBindBuffer(GL_COPY_READ_BUFFER, bufferVisBitsCurrent.buffer );
  glBindBuffer(GL_COPY_WRITE_BUFFER, m_bufferVisBitsReadback.buffer );
  glCopyBufferSubData(GL_COPY_READ_BUFFER, GL_COPY_WRITE_BUFFER, bufferVisBitsCurrent.offset, m_bufferVisBitsReadback.offset, size);
  glBindBuffer( GL_COPY_READ_BUFFER, 0 );
  glBindBuffer( GL_COPY_WRITE_BUFFER, 0 );
}

void CullingSystem::JobReadback::resultClient()
{
  glBindBuffer(GL_COPY_WRITE_BUFFER, m_bufferVisBitsReadback.buffer);
  glGetBufferSubData(GL_COPY_WRITE_BUFFER, m_bufferVisBitsReadback.offset, m_bufferVisBitsReadback.size, m_hostVisBits);
  glBindBuffer( GL_COPY_WRITE_BUFFER, 0);
}

void CullingSystem::JobReadbackPersistent::resultFromBits(const Buffer& bufferVisBitsCurrent)
{
  GLsizeiptr size = sizeof( int ) * minDivide( m_numObjects, 32 );
  glCopyNamedBufferSubData( bufferVisBitsCurrent.buffer, m_bufferVisBitsReadback.buffer, bufferVisBitsCurrent.offset, m_bufferVisBitsReadback.offset, size);
  if (m_fence) {
    glDeleteSync( m_fence );
  }
  m_fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);
}

void CullingSystem::JobReadbackPersistent::resultClient()
{
  if (m_fence) {
    GLsizeiptr size = sizeof( int ) * minDivide( m_numObjects, 32 );
    // as some samples read-back within same frame (not recommended) we use the flush here, normally one wouldnt use it
    glClientWaitSync(m_fence, GL_SYNC_FLUSH_COMMANDS_BIT, GL_TIMEOUT_IGNORED);
    glDeleteSync(m_fence);
    m_fence = NULL;
    memcpy( m_hostVisBits, ((uint8_t*)m_bufferVisBitsMapping) + m_bufferVisBitsReadback.offset, size );
  }
}


================================================
FILE: cullingsystem.hpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#ifndef CULLINGSYSTEM_H__
#define CULLINGSYSTEM_H__

#include <cstddef>
#include <cstdint>
#include <nvgl/extensions_gl.hpp>


// Helper that runs GPU visibility tests (frustum, hierarchical-Z, raster)
// over sets of bounding boxes and hands the resulting visibility bits to a
// user-provided Job for consumption.
class CullingSystem {
public:
  // GL program objects used by the system; supplied by the caller through
  // init()/update().
  struct Programs {
    GLuint  object_frustum;
    GLuint  object_hiz;
    GLuint  object_raster;

    GLuint  bit_temporallast;
    GLuint  bit_temporalnew;
    GLuint  bit_regular;
    GLuint  depth_mips;
  };

  // Visibility test selected in buildOutput().
  enum MethodType {
    METHOD_FRUSTUM,
    METHOD_HIZ,
    METHOD_RASTER,
    NUM_METHODS,
  };

  // Bit-combination mode selected in bitsFromOutput().
  enum BitType {
    BITS_CURRENT,
    BITS_CURRENT_AND_LAST,
    BITS_CURRENT_AND_NOT_LAST,
    NUM_BITS,
  };

  // A (buffer object, offset, size) range plus an optional stride.
  // The struct does not own the GL buffer; nothing here deletes it.
  struct Buffer {
    GLuint      buffer;
    GLsizei     stride;
    GLintptr    offset;
    GLsizeiptr  size;

    // Creates a new immutable-storage buffer of `sizei` bytes, optionally
    // initialised from `data`, and makes this range cover all of it.
    void create( size_t sizei, const void* data, GLbitfield flags )
    {
      size = sizei;
      offset = 0;
      stride = 0;
      glCreateBuffers( 1, &buffer );
      glNamedBufferStorage( buffer, size, data, flags );
    }

    // Wraps an existing buffer object. When `sizei` is 0 the size is
    // queried from GL; otherwise `sizei` is used as-is.
    Buffer( GLuint buffer, size_t sizei = 0 )
      : buffer( buffer )
      , stride( 0 )
      , offset( 0 )
      , size( 0 )
    {
      if (!sizei) {
        // Query into a correctly typed temporary instead of casting the
        // address of `size`, so the result is well-defined even if the
        // query fails and for either width of GLsizeiptr.
        if (sizeof( GLsizeiptr ) > 4) {
          GLint64 queried = 0;
          glGetNamedBufferParameteri64v( buffer, GL_BUFFER_SIZE, &queried );
          size = (GLsizeiptr)queried;
        }
        else {
          GLint queried = 0;
          glGetNamedBufferParameteriv( buffer, GL_BUFFER_SIZE, &queried );
          size = (GLsizeiptr)queried;
        }
      }
      else {
        size = sizei;
      }
    }

    // Empty range referring to buffer object 0.
    Buffer()
      : buffer(0)
      , stride(0)
      , offset(0)
      , size(0)
    {

    }

    // Binds this range to the indexed binding point `index` of `target`.
    inline void BindBufferRange(GLenum target, GLuint index) const {
      glBindBufferRange(target, index, buffer, offset, size);
    }
    // Attaches this range to the buffer texture bound at `target`.
    inline void TexBuffer(GLenum target, GLenum internalformat) const {
      glTexBufferRange(target, internalformat, buffer, offset, size);
    }
    // Clears this range of whatever buffer is currently bound to `target`;
    // the caller must have bound this buffer there beforehand.
    inline void ClearBufferSubData(GLenum target,GLenum internalformat,GLenum format,GLenum type,const GLvoid* data) const {
      glClearBufferSubData(target,internalformat,offset,size,format,type,data);
    }

  };
  
  // Input/output description of one culling task. Derive from it to decide
  // what happens with the resulting visibility bits.
  class Job {
  public:
    Job()
      : m_numObjects(0)
      , m_textureDepthWithMipmaps(0)
    {
    }

    // Jobs are used polymorphically, so destruction through a base
    // pointer/reference must be well-defined.
    virtual ~Job() {}

    int     m_numObjects;
      // world-space matrices {mat4 world, mat4 worldInverseTranspose}
    Buffer  m_bufferMatrices;
    Buffer  m_bufferBboxes; // only used in dualindex mode (2 x vec4)
      // 1 32-bit integer per object (index)
    Buffer  m_bufferObjectMatrix;
      // object-space bounding box (2 x vec4)
      // or 1 32-bit integer per object (dualindex mode)
    Buffer  m_bufferObjectBbox;
    
      // 1 32-bit integer per object
    Buffer  m_bufferVisOutput;
    
      // 1 32-bit integer per 32 objects (1 bit per object)
    Buffer  m_bufferVisBitsCurrent;
    Buffer  m_bufferVisBitsLast;
    
      // for HiZ
    GLuint  m_textureDepthWithMipmaps;

    // derive from this class and implement this function how you want to
    // deal with the results that are provided in the buffer
    virtual void resultFromBits( const Buffer& bufferVisBitsCurrent ) = 0;
    // for readback methods we need to wait for a result
    virtual void resultClient() {}

  };

  class JobReadback : public Job {
  public:
    JobReadback()
      : m_hostVisBits(NULL)
    {
    }

    // 1 32-bit integer per 32 objects (1 bit per object)
    Buffer      m_bufferVisBitsReadback;
    uint32_t*   m_hostVisBits;

    // Do not use this Job class unless you have to. Persistent 
    // mapped buffers are preferred.

    // Copies result into readback buffer
    void resultFromBits( const Buffer& bufferVisBitsCurrent );

    // getBufferData into hostVisBits (blocking!)
    void resultClient();
  };

  class JobReadbackPersistent : public Job {
  public:
    // m_fence must start out null: resultFromBits/resultClient test it
    // before the first fence has been recorded.
    JobReadbackPersistent()
      : m_bufferVisBitsMapping(NULL)
      , m_hostVisBits(NULL)
      , m_fence(0)
    {
    }

    // 1 32-bit integer per 32 objects (1 bit per object)
    Buffer      m_bufferVisBitsReadback;
    void*       m_bufferVisBitsMapping;
    uint32_t*   m_hostVisBits;
    GLsync      m_fence;

    // Copies result into readback buffer and records
    // a fence.
    void resultFromBits(const Buffer& bufferVisBitsCurrent);

    // waits on fence and copies mapping into hostVisBits
    void resultClient();
  };

  // multidrawindirect based
  class JobIndirectUnordered : public Job {
  public:
    JobIndirectUnordered()
      : m_program_indirect_compact(0)
    {
    }

    GLuint  m_program_indirect_compact;
    // 1 indirectSize per object, 
    Buffer  m_bufferObjectIndirects;
    Buffer  m_bufferIndirectResult;
    // 1 integer
    Buffer  m_bufferIndirectCounter;

    void resultFromBits( const Buffer& bufferVisBitsCurrent );
  };
  
  // Camera data passed to buildOutput(); the pointers are forwarded to
  // glUniformMatrix4fv / glUniform3fv.
  struct View {
    const float*  viewProjMatrix;
    const float*  viewDir;
    const float*  viewPos;
  };
  
  void init( const Programs &programs, bool dualindex );
  void deinit();
  void update( const Programs &programs, bool dualindex );
  
  // helper function for HiZ method, leaves fbo bound to 0
  void buildDepthMipmaps(GLuint textureDepth, int width, int height);
  
  // assumes relevant fbo bound for raster method
  void buildOutput( MethodType  method, Job &job, const View& view );

  void bitsFromOutput ( Job &job, BitType type );
  void resultFromBits ( Job &job );
  void resultClient   ( Job &job );

  // swaps the Current/Last bit array (for temporal coherent techniques)
  void swapBits       ( Job &job );

private:

  // Uniform locations of the programs in m_programs.
  struct Uniforms {
    GLint   depth_lod;
    GLint   depth_even;
    GLint   frustum_viewProj;
    GLint   hiz_viewProj;
    GLint   raster_viewProj;
    GLint   raster_viewDir;
    GLint   raster_viewPos;
  };

  void testBboxes( Job &job, bool raster);
  
  Programs  m_programs;
  Uniforms  m_uniforms;
  GLuint    m_fbo;
  GLuint    m_tbo[2];
  bool      m_dualindex;
  bool      m_useSSBO;
  bool      m_useRepesentativeTest;
};

#endif


================================================
FILE: nodetree.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include "nodetree.hpp"
#include <assert.h>

//////////////////////////////////////////////////////////////////////////


// Puts a node into its unlinked state: every index/link is INVALID and the
// level is -1.
static inline void clearNode(NodeTree::Node &node)
{
  // tree links
  node.parentidx = node.childidx = node.siblingidx = NodeTree::INVALID;
  // positions inside the per-level node / leaf lists
  node.levelidx = node.leafidx = NodeTree::INVALID;
  node.level    = -1;
}

// Creates an empty tree. The root starts unlinked, but with a valid level
// index (0) so that it counts as "in tree", and with level -1 so that its
// direct children land on level 0.
NodeTree::NodeTree()
  : m_treeCompactChangeID(0)
  , m_nodesActive(0)
  , m_levelsUsed(0)
{
  clearNode(m_root);
  m_root.level    = -1;
  m_root.levelidx =  0;
}

// Returns the level with the given index, or nullptr when `level` is
// outside [0, number of used levels).
const NodeTree::Level* NodeTree::getUsedLevel( int level ) const
{
  const bool outOfRange = level < 0 || level >= m_levelsUsed;
  if (outOfRange){
    return nullptr;
  }
  return &m_levels[level];
}

// Returns the counter that updateLevelNode increments every time it writes
// a node's compact parent entry.
unsigned int NodeTree::getTreeParentChangeID() const
{
  const unsigned int changeID = m_treeCompactChangeID;
  return changeID;
}

const std::vector<NodeTree::compactID>& NodeTree::getTreeCompactNodes() const
{
  return m_treeCompactNodes;
}

// Returns the id of a cleared, unlinked node. Ids released by deleteNode
// are reused (most recently released first); otherwise the node array and
// the compact array each grow by one entry.
NodeTree::nodeID NodeTree::createNode()
{
  nodeID id;

  if (m_unusedNodes.empty()){
    // no recycled slot available: append a new one
    id = (nodeID)m_nodes.size();
    m_nodes.push_back(Node());
    m_treeCompactNodes.push_back(compactID());
  }
  else{
    id = m_unusedNodes.back();
    m_unusedNodes.pop_back();
  }

  clearNode(getNode(id));

  return id;
}

// Detaches all children of the node, detaches the node from its parent and
// puts its id on the free list for reuse by createNode.
// The node must be valid and must not be the root.
void NodeTree::deleteNode( nodeID nodeidx )
{
  assert (isValid(nodeidx) && nodeidx != ROOT);

  const Node &doomed = getNode(nodeidx);

  // make children unlinked; each call removes the current first child, so
  // the first-child link is re-read every iteration
  for (nodeID child = doomed.childidx; isValid(child); child = doomed.childidx){
    setNodeParent(child, INVALID);
  }

  // remove self from parent list
  setNodeParent(nodeidx, INVALID);

  m_unusedNodes.push_back(nodeidx);
}

// Re-parents a node.
//
//   nodeidx   : node to move; must be valid and must not be the root.
//   parentidx : new parent, or INVALID to only detach the node.
//
// The node is removed from its old parent's child list, inserted at the
// front of the new parent's child list, and - if the node is currently part
// of the tree - its level placement is refreshed through updateLevelNode.
// Does nothing when the parent is unchanged.
void NodeTree::setNodeParent( nodeID nodeidx, nodeID parentidx )
{
  assert (isValid(nodeidx) && nodeidx != ROOT);

  Node &node = getNode(nodeidx);
  if (node.parentidx == parentidx)
    return;

  if (isValid(node.parentidx)){
    // unlink from old
    Node& parent = getNode(node.parentidx);
    bool found = false;
    
    if (parent.childidx == nodeidx){
      // node is the first child
      parent.childidx = node.siblingidx;
      found = true;
    }
    else if (isValid(parent.childidx)){
      // walk the sibling chain until the predecessor of node is found
      nodeID child = parent.childidx;
      while(isValid(getNode(child).siblingidx)){
        if (getNode(child).siblingidx == nodeidx){
          getNode(child).siblingidx = node.siblingidx;
          found = true;
          break;
        }
        child = getNode(child).siblingidx;
      }
    }

    assert(found && "node was not a child of parent");
    (void)found; // only read by the assert; avoids an unused warning with NDEBUG
    node.siblingidx = INVALID;
    updateLeafNode(node.parentidx);
  }

  if (isValid(parentidx)){
    // link to new
    Node& parent = getNode(parentidx);
    node.siblingidx = parent.childidx;
    parent.childidx = nodeidx;
    // node.parentidx still holds the OLD parent here (it is assigned at the
    // end of this function), so the new parent must be passed explicitly.
    updateLeafNode(parentidx);
  }

  if (isNodeInTree(nodeidx)){
    updateLevelNode(nodeidx, isNodeInTree(parentidx) ? parentidx : INVALID);
  }

  node.parentidx = parentidx;
}

// Activates a node below its already assigned parent.
// Preconditions (asserted): the node is valid, is not the root, is not yet
// in the tree, and its parent is in the tree.
void NodeTree::addToTree( nodeID nodeidx )
{
  assert (isValid(nodeidx) && nodeidx != ROOT);

  const nodeID parentidx = getNode(nodeidx).parentidx;

  assert (!isNodeInTree(nodeidx)        && "must not be already added to tree");
  assert ( isNodeInTree(parentidx) && "parent must be already added to tree");

  updateLevelNode(nodeidx, parentidx);
}

// Deactivates a node (and, through updateLevelNode, its children) without
// changing any parent/child links.
// Preconditions (asserted): the node is valid, is not the root and is
// currently in the tree.
void NodeTree::removeFromTree( nodeID nodeidx )
{
  assert (isValid(nodeidx) && nodeidx != ROOT);
  assert (isNodeInTree(nodeidx) && "must be already added to tree");

  updateLevelNode(nodeidx,INVALID);
}

// Registers a node in the level directly below its parent's level:
// appends it to that level's node list, adds it to the leaf list when it
// has no children, and grows the used-level and active-node counters.
void NodeTree::addToLevel( nodeID nodeidx, nodeID parentidx )
{
  Node&     node     = getNode(nodeidx);
  const int newLevel = getNode(parentidx).level + 1;
  Level&    level    = getLevel(newLevel);

  ++level.changeID;

  node.level    = newLevel;
  node.levelidx = (lvlID)level.nodes.size();
  level.nodes.push_back(nodeidx);

  const bool childless = !isValid(node.childidx);
  if (childless){
    addLeafNode(nodeidx);
  }

  if (m_levelsUsed < newLevel + 1){
    m_levelsUsed = newLevel + 1;
  }

  ++m_nodesActive;
}

// Unregisters a node from its level: swap-removes it from the level's node
// list (and leaf list, if listed), shrinks the used-level count and resets
// the node's level bookkeeping.
void NodeTree::removeFromLevel( nodeID nodeidx )
{
  Node&   node  = getNode(nodeidx);
  Level&  level = getLevel(node.level);

  level.changeID++;

  // swap-remove: the last entry takes over the freed slot
  level.nodes[node.levelidx] = level.nodes[level.nodes.size()-1];
  getNode(level.nodes[node.levelidx]).levelidx = node.levelidx;
  level.nodes.pop_back();

  if (isValid(node.leafidx)){
    removeLeafNode(nodeidx);
  }

  // Drop every trailing empty level, not just this node's own level. When a
  // subtree is removed top-down, an upper level can become empty while
  // deeper levels are still populated; it must be released once those
  // deeper levels empty out as well.
  while (m_levelsUsed > 0 && m_levels[m_levelsUsed-1].nodes.empty()){
    m_levelsUsed--;
  }

  node.level    = -1;
  node.levelidx = INVALID;
  node.leafidx  = INVALID;

  m_nodesActive--;
}

// Swap-removes a node from the leaf list of its level and marks the node as
// no longer listed. The node must be in the tree and currently listed
// (node.leafidx valid).
void NodeTree::removeLeafNode( nodeID nodeidx )
{
  assert(isNodeInTree(nodeidx));
  Node& node    = getNode(nodeidx);
  Level& level  = getLevel(node.level);
  // remove: the last leaf takes over the freed slot
  level.leaves[node.leafidx] = level.leaves[level.leaves.size()-1];
  getNode(level.leaves[node.leafidx]).leafidx = node.leafidx;
  level.leaves.pop_back();
  // Invalidate the index so a later removal cannot use a stale slot.
  node.leafidx = INVALID;
}

// Appends a node to the leaf list of its level and stores the list
// position in the node. The node must already be in the tree.
void NodeTree::addLeafNode( nodeID nodeidx )
{
  assert(isNodeInTree(nodeidx));

  Node& node = getNode(nodeidx);
  std::vector<nodeID>& leaves = getLevel(node.level).leaves;

  // add
  const lvlID slot = (lvlID)leaves.size();
  leaves.push_back(nodeidx);
  node.leafidx = slot;
}

// Re-evaluates whether a node belongs in its level's leaf list after its
// child list changed. Leaves are the nodes without children (see
// addToLevel), so:
//   - a node that gained children while listed is removed from the list,
//   - a node that lost all children while not listed is added to it.
// Nodes that are not part of the tree are ignored.
void NodeTree::updateLeafNode( nodeID nodeidx )
{
  if (!isNodeInTree(nodeidx))
    return;

  Node& node    = getNode(nodeidx);
  const bool hasChildren  = isValid(node.childidx);
  const bool listedAsLeaf = isValid(node.leafidx);

  if (hasChildren && listedAsLeaf){
    removeLeafNode(nodeidx);
    // make sure no stale list position survives the removal
    node.leafidx = INVALID;
  }
  else if (!hasChildren && !listedAsLeaf){
    addLeafNode(nodeidx);
  }
}

// Updates the level placement of a node after its (effective) parent
// changed, then recurses into all of its children.
//
//   nodeidx   : node whose placement is refreshed.
//   parentidx : the new in-tree parent, or INVALID when the node is to be
//               deactivated.
//
// Always records the new parent in the compact array and bumps
// m_treeCompactChangeID, even when the early return below is taken.
void NodeTree::updateLevelNode( nodeID nodeidx, nodeID parentidx )
{
  // at this point node.parentidx is still the old value
  Node &node = getNode(nodeidx);

  // update level parent buffer to reflect last state always
  // (compactID::parent is a PARENTBITS-wide bitfield, so the id is stored
  // truncated to that width)
  m_treeCompactNodes[nodeidx].parent = parentidx;
  m_treeCompactChangeID++;

  if (isValid(node.levelidx)){
    // already active
    if (isValid(parentidx)){
      const Node& parent = getNode(parentidx);
      int oldlevel = node.level;
      int newlevel = parent.level + 1;

      // we remain in the same level and only our parent has changed
      // (children keep their levels too, so no recursion is needed)
      if (oldlevel == newlevel){
        return;
      }

      // move to the new level
      removeFromLevel(nodeidx);
      addToLevel(nodeidx,parentidx);
    }
    else{
      // deactivate: removeFromLevel resets node.level to -1
      removeFromLevel(nodeidx);
    }
  }
  else if (isValid(parentidx)){
    // was inactive
    // add to level
    addToLevel(nodeidx,parentidx);
  }

  // mirror the (possibly changed) level into the compact array
  m_treeCompactNodes[nodeidx].level  = node.level;

  // propagate to all children: they follow this node into the tree, or out
  // of it when this node was deactivated
  nodeID child = node.childidx;
  while (isValid(child)){
    updateLevelNode(child, isValid(parentidx) ? nodeidx : INVALID );
    child = getNode(child).siblingidx;
  }
}

// Pre-allocates capacity for `numNodes` entries in both the node array and
// the compact array; the number of nodes itself is unchanged.
void NodeTree::reserve( int numNodes )
{
  m_treeCompactNodes.reserve( numNodes );
  m_nodes.reserve( numNodes );
}

void NodeTree::create( int numNodes )
{
  Node node;
  clearNode(node);

  m_nodes.resize( numNodes, node );
  m_treeCompactNodes.resize( numNodes, compactID() );
}

void NodeTree::clear()
{
  m_nodesActive = 0;
  m_levelsUsed  = 0;
  m_treeCompactChangeID = 0;
  m_levels.clear();
  m_nodes.clear();
  m_treeCompactNodes.clear();
}


================================================
FILE: nodetree.hpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#pragma once

#include <vector>

// Hierarchy of nodes stored in flat arrays. Nodes are created detached,
// linked through setNodeParent and become "active" once added to the tree,
// at which point they are also tracked per depth level (all nodes and the
// childless "leaf" nodes of that level).
class NodeTree {
public:
  enum Flags {
    INVALID = 0xFFFFFFFF,       // "no node" / "no index" marker
    ROOT = 0x7FFFFFFF,          // id of the implicit root node
    LEVELBITS = 8,              // width of compactID::level
    PARENTBITS = 32 - LEVELBITS // width of compactID::parent
  };

  // all-ones values of the two compactID bitfields
  static constexpr unsigned INVALID_LEVEL = (1 << LEVELBITS) - 1;
  static constexpr unsigned INVALID_PARENT = (1 << PARENTBITS) - 1;

  // Packed {level, parent} record kept per node; default-constructed
  // entries carry the INVALID_* values.
  struct compactID {
    unsigned level : LEVELBITS;
    unsigned parent : PARENTBITS;

    compactID(){
      level = INVALID_LEVEL;
      parent = INVALID_PARENT;
    }
  };
  typedef unsigned int nodeID;
  typedef unsigned int lvlID;


  // Per-depth bookkeeping.
  struct Level {
    unsigned int          changeID; // incremented whenever `nodes` changes
    std::vector<nodeID>   nodes;    // all active nodes of this level
    std::vector<nodeID>   leaves;   // active nodes of this level tracked as leaves

    Level(){
      changeID = 0;
    }
  };

  struct Node {
    nodeID                parentidx;  // parent node, or INVALID
    lvlID                 levelidx;   // position in Level::nodes, INVALID when inactive
    lvlID                 leafidx;    // position in Level::leaves, or INVALID
    int                   level;      // depth level, -1 when inactive
    nodeID                childidx;   // first child, or INVALID
    nodeID                siblingidx; // next sibling, or INVALID
  };

private:

  Node                              m_root;

  // general nodes
  std::vector<Node>                 m_nodes;
  std::vector<nodeID>               m_unusedNodes;

  // actual nodes added to tree
  std::vector<compactID>            m_treeCompactNodes;
  std::vector<Level>                m_levels;
  unsigned int                      m_treeCompactChangeID;
  int                               m_nodesActive;
  int                               m_levelsUsed;

public:
  NodeTree();

  // Level `level`, or nullptr when it is not within the used levels.
  const Level*  getUsedLevel(int level) const;
  inline int getNumUsedLevel() const 
  {
    return m_levelsUsed;
  }

  unsigned int getTreeParentChangeID() const;
  const std::vector<compactID>& getTreeCompactNodes() const;

  inline nodeID getTreeRoot() const
  {
    return ROOT;
  }

  // Node lookup; ROOT maps to the dedicated root node, every other id
  // indexes the node array without a bounds check.
  inline const Node& getNode(nodeID nodeidx) const
  {
    if (nodeidx == ROOT) return m_root;
    else                 return m_nodes[nodeidx];
  }

  // True for any id other than INVALID. Does not look at tree state, hence
  // static; still callable as a member.
  static inline bool  isValid(unsigned int id)
  {
    return id != INVALID;
  }

  // True when the node id is valid and the node currently has a level slot.
  inline bool  isNodeInTree(nodeID nodeidx) const
  {
    return isValid(nodeidx) && isValid(getNode(nodeidx).levelidx);
  }

  inline nodeID  getParentNode(nodeID nodeidx) const
  {
    return getNode(nodeidx).parentidx;
  }

  nodeID  createNode();

  void    deleteNode(nodeID nodeidx);

  void    setNodeParent(nodeID nodeidx, nodeID parentidx);

  void    addToTree(nodeID nodeidx);

  void    removeFromTree(nodeID nodeidx);

  void    reserve(int numNodes);

  void    create(int numNodes);

  void    clear();

  int     getNumActiveNodes() const {
    return m_nodesActive;
  }

private:

  // Returns the level with the given index, growing the level array on
  // demand so the index is always valid.
  inline Level& getLevel(int level)
  {
    if ((int)m_levels.size() < level+1){
      m_levels.resize(level+1);
    }
    return m_levels[level];
  }

  inline Node& getNode(nodeID nodeidx)
  {
    if (nodeidx == ROOT) return m_root;
    else                 return m_nodes[nodeidx];
  }

  void addToLevel(nodeID nodeidx, nodeID parentidx);

  void removeFromLevel(nodeID nodeidx);

  void removeLeafNode(nodeID nodeidx);

  void addLeafNode(nodeID nodeidx);

  void updateLeafNode(nodeID nodeidx);

  void updateLevelNode(nodeID nodeidx, nodeID parentidx);

};


================================================
FILE: nvtoken.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include "nvtoken.hpp"

namespace nvtoken
{

  //////////////////////////////////////////////////////////////////////////
  // generic

  // Lookup tables filled by nvtokenInitInternals.
  // Header word per token type (hardware-provided or software-encoded).
  GLuint   s_nvcmdlist_header[NVTOKEN_TYPES] = {0};
  // Size in bytes of each token type's struct.
  GLuint   s_nvcmdlist_headerSizes[NVTOKEN_TYPES] = {0};
  // Stage index per NVTOKEN_STAGE_* value.
  GLushort s_nvcmdlist_stages[NVTOKEN_STAGES] = {0};
  // Whether address tokens are interpreted as bindless GPU addresses.
  bool     s_nvcmdlist_bindless  = false;
  
  // Software header encoding: token type in the low 16 bits, token size in
  // the bits above.
  static inline GLuint nvtokenHeaderSW(GLuint type, GLuint size){
    const GLuint sizeBits = size << 16;
    return type | sizeBits;
  }
  
  // Extracts the token type (low 16 bits) from a software-encoded header.
  static inline GLenum nvtokenHeaderCommandSW(GLuint header)
  {
    const GLuint typeMask = 0xFFFF;
    return header & typeMask;
  }

  // Extracts the token size (bits 16 and up) from a software-encoded header.
  static inline GLuint nvtokenHeaderSizeSW(GLuint header)
  {
    const GLuint sizeBits = header >> 16;
    return sizeBits;
  }

  // Maps a header word back to its token type by searching the header
  // table. Asserts when the header is unknown; with asserts disabled the
  // result is -1 converted to GLenum.
  static inline GLenum nvtokenHeaderCommand(GLuint header)
  {
    int found = -1;
    for (int i = 0; i < NVTOKEN_TYPES && found < 0; i++){
      if (s_nvcmdlist_header[i] == header){
        found = i;
      }
    }

    if (found >= 0){
      return found;
    }

    assert(0 && "can't find header");
    return -1;
  }

  template <class T>
  static void nvtokenRegisterSize()
  {
    s_nvcmdlist_headerSizes[T::ID] = sizeof(T);
  }

  void nvtokenInitInternals( bool hwsupport, bool bindlessSupport)
  {
    assert( !hwsupport || (hwsupport && bindlessSupport) );

    nvtokenRegisterSize<NVTokenTerminate>();
    nvtokenRegisterSize<NVTokenNop>();
    nvtokenRegisterSize<NVTokenDrawElems>();
    nvtokenRegisterSize<NVTokenDrawArrays>();
    nvtokenRegisterSize<NVTokenDrawElemsStrip>();
    nvtokenRegisterSize<NVTokenDrawArraysStrip>();
    nvtokenRegisterSize<NVTokenDrawElemsInstanced>();
    nvtokenRegisterSize<NVTokenDrawArraysInstanced>();
    nvtokenRegisterSize<NVTokenVbo>();
    nvtokenRegisterSize<NVTokenIbo>();
    nvtokenRegisterSize<NVTokenUbo>();
    nvtokenRegisterSize<NVTokenLineWidth>();
    nvtokenRegisterSize<NVTokenPolygonOffset>();
    nvtokenRegisterSize<NVTokenScissor>();
    nvtokenRegisterSize<NVTokenBlendColor>();
    nvtokenRegisterSize<NVTokenViewport>();
    nvtokenRegisterSize<NVTokenAlphaRef>();
    nvtokenRegisterSize<NVTokenStencilRef>();
    nvtokenRegisterSize<NVTokenFrontFace>();
    
    for (int i = 0; i < NVTOKEN_TYPES; i++){
      GLuint sz = s_nvcmdlist_headerSizes[i];
      assert(sz);
    }
    
    s_nvcmdlist_bindless  = bindlessSupport;
    
    if (hwsupport){
      for (int i = 0; i < NVTOKEN_TYPES; i++){
        s_nvcmdlist_header[i] = glGetCommandHeaderNV(i,s_nvcmdlist_headerSizes[i]);
      }
      s_nvcmdlist_stages[NVTOKEN_STAGE_VERTEX] = glGetStageIndexNV(GL_VERTEX_SHADER);
      s_nvcmdlist_stages[NVTOKEN_STAGE_TESS_CONTROL] = glGetStageIndexNV(GL_TESS_CONTROL_SHADER);
      s_nvcmdlist_stages[NVTOKEN_STAGE_TESS_EVALUATION] = glGetStageIndexNV(GL_TESS_EVALUATION_SHADER);
      s_nvcmdlist_stages[NVTOKEN_STAGE_GEOMETRY] = glGetStageIndexNV(GL_GEOMETRY_SHADER);
      s_nvcmdlist_stages[NVTOKEN_STAGE_FRAGMENT] = glGetStageIndexNV(GL_FRAGMENT_SHADER);
    }
    else{
      for (int i = 0; i < NVTOKEN_TYPES; i++){
        s_nvcmdlist_header[i] = nvtokenHeaderSW(i,s_nvcmdlist_headerSizes[i]);
      }
      for (int i = 0; i < NVTOKEN_STAGES; i++){
        s_nvcmdlist_stages[i] = i;
      }
    }
  }

// Expands to a switch case that returns the enumerant's own spelling.
#define TOSTRING(a)  case a: return #a;
  // Returns the name of a command token enumerant as a string literal, or
  // NULL when `type` is not a known command token.
  const char* nvtokenCommandToString(GLenum type){
    switch  (type){
      TOSTRING(GL_NOP_COMMAND_NV                   );
      TOSTRING(GL_DRAW_ELEMENTS_INSTANCED_COMMAND_NV);
      TOSTRING(GL_DRAW_ARRAYS_INSTANCED_COMMAND_NV  );
      TOSTRING(GL_ELEMENT_ADDRESS_COMMAND_NV       );
      TOSTRING(GL_ATTRIBUTE_ADDRESS_COMMAND_NV     );
      TOSTRING(GL_UNIFORM_ADDRESS_COMMAND_NV       );
      TOSTRING(GL_BLEND_COLOR_COMMAND_NV           );
      TOSTRING(GL_STENCIL_REF_COMMAND_NV           );
      TOSTRING(GL_TERMINATE_SEQUENCE_COMMAND_NV    );
      TOSTRING(GL_LINE_WIDTH_COMMAND_NV            );
      TOSTRING(GL_POLYGON_OFFSET_COMMAND_NV        );
      TOSTRING(GL_ALPHA_REF_COMMAND_NV             );
      TOSTRING(GL_VIEWPORT_COMMAND_NV              );
      TOSTRING(GL_SCISSOR_COMMAND_NV               );
      TOSTRING(GL_DRAW_ELEMENTS_COMMAND_NV         );
      TOSTRING(GL_DRAW_ARRAYS_COMMAND_NV           );
      TOSTRING(GL_DRAW_ELEMENTS_STRIP_COMMAND_NV   );
      TOSTRING(GL_DRAW_ARRAYS_STRIP_COMMAND_NV     );
      // front-face tokens are registered and emulated like all others, so
      // they need a name as well
      TOSTRING(GL_FRONT_FACE_COMMAND_NV            );
    }
    return NULL;
  }

  //////////////////////////////////////////////////////////////////////////


  void nvtokenGetStats( const void* NV_RESTRICT stream, size_t streamSize, int stats[NVTOKEN_TYPES] )
  {
    const GLubyte* NV_RESTRICT current = (GLubyte*)stream;
    const GLubyte* streamEnd = current + streamSize;

    while (current < streamEnd){
      const GLuint*             header  = (const GLuint*)current;

      GLenum type = nvtokenHeaderCommand(*header);
      stats[type]++;

      current += s_nvcmdlist_headerSizes[type];
    }
  }


  // Emulation related

  static inline GLenum nvtokenDrawCommandSequenceSW( const void* NV_RESTRICT stream, size_t streamSize, GLenum mode, GLenum type, const StateSystem::State& state )
  {
    const GLubyte* NV_RESTRICT current = (GLubyte*)stream;
    const GLubyte* streamEnd = current + streamSize;

    GLenum modeStrip;
    if      (mode == GL_LINES)                modeStrip = GL_LINE_STRIP;
    else if (mode == GL_TRIANGLES)            modeStrip = GL_TRIANGLE_STRIP;
    /*else if (mode == GL_QUADS)                modeStrip = GL_QUAD_STRIP;*/
    else if (mode == GL_LINES_ADJACENCY)      modeStrip = GL_LINE_STRIP_ADJACENCY;
    else if (mode == GL_TRIANGLES_ADJACENCY)  modeStrip = GL_TRIANGLE_STRIP_ADJACENCY;
    else    modeStrip = mode;

    GLenum modeSpecial;
    if      (mode == GL_LINES)      modeSpecial = GL_LINE_LOOP;
    else if (mode == GL_TRIANGLES)  modeSpecial = GL_TRIANGLE_FAN;
    else    modeSpecial = mode;

    while (current < streamEnd){
      const GLuint*             header  = (const GLuint*)current;

      GLenum cmdtype = nvtokenHeaderCommand(*header);
      // if you always use emulation on non-native tokens you can use 
      // cmdtype = nvtokenHeaderCommandSW(header->encoded)
      switch(cmdtype){
      case GL_TERMINATE_SEQUENCE_COMMAND_NV:
        {
          return type;
        }
        break;
      case GL_NOP_COMMAND_NV:
        {
        }
        break;
      case GL_DRAW_ELEMENTS_COMMAND_NV:
        {
          const DrawElementsCommandNV* cmd = (const DrawElementsCommandNV*)current;
          glDrawElementsBaseVertex(mode, cmd->count, type, (const GLvoid*)(cmd->firstIndex * sizeof(GLuint)), cmd->baseVertex);
        }
        break;
      case GL_DRAW_ARRAYS_COMMAND_NV:
        {
          const DrawArraysCommandNV* cmd = (const DrawArraysCommandNV*)current;
          glDrawArrays(mode, cmd->first, cmd->count);
        }
        break;
      case GL_DRAW_ELEMENTS_STRIP_COMMAND_NV:
        {
          const DrawElementsCommandNV* cmd = (const DrawElementsCommandNV*)current;
          glDrawElementsBaseVertex(modeStrip, cmd->count, type, (const GLvoid*)(cmd->firstIndex * sizeof(GLuint)), cmd->baseVertex);
        }
        break;
      case GL_DRAW_ARRAYS_STRIP_COMMAND_NV:
        {
          const DrawArraysCommandNV* cmd = (const DrawArraysCommandNV*)current;
          glDrawArrays(modeStrip, cmd->first, cmd->count);
        }
        break;
      case GL_DRAW_ELEMENTS_INSTANCED_COMMAND_NV:
        {
          const DrawElementsInstancedCommandNV* cmd = (const DrawElementsInstancedCommandNV*)current;

          assert (cmd->mode == mode || cmd->mode == modeStrip || cmd->mode == modeSpecial);

          glDrawElementsIndirect(cmd->mode, type, &cmd->count);
        }
        break;
      case GL_DRAW_ARRAYS_INSTANCED_COMMAND_NV:
        {
          const DrawArraysInstancedCommandNV* cmd = (const DrawArraysInstancedCommandNV*)current;

          assert (cmd->mode == mode || cmd->mode == modeStrip || cmd->mode == modeSpecial);

          glDrawArraysIndirect(cmd->mode, &cmd->count);
        }
        break;
      case GL_ELEMENT_ADDRESS_COMMAND_NV:
        {
          const ElementAddressCommandNV* cmd = (const ElementAddressCommandNV*)current;
          type = cmd->typeSizeInByte == 4 ? GL_UNSIGNED_INT : GL_UNSIGNED_SHORT;
          if (s_nvcmdlist_bindless){
            glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, GLuint64(cmd->addressLo) | (GLuint64(cmd->addressHi)<<32), 0x7FFFFFFF);
          }
          else{
            const ElementAddressCommandEMU* cmd = (const ElementAddressCommandEMU*)current;
            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, cmd->buffer);
          }
        }
        break;
      case GL_ATTRIBUTE_ADDRESS_COMMAND_NV:
        {
          if (s_nvcmdlist_bindless){
            const AttributeAddressCommandNV* cmd = (const AttributeAddressCommandNV*)current;
            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, cmd->index, GLuint64(cmd->addressLo) | (GLuint64(cmd->addressHi)<<32), 0x7FFFFFFF);
          }
          else{
            const AttributeAddressCommandEMU* cmd = (const AttributeAddressCommandEMU*)current;
            glBindVertexBuffer(cmd->index, cmd->buffer, cmd->offset, state.vertexformat.bindings[cmd->index].stride);
          }
        }
        break;
      case GL_UNIFORM_ADDRESS_COMMAND_NV:
        {
           if (s_nvcmdlist_bindless){
            const UniformAddressCommandNV* cmd = (const UniformAddressCommandNV*)current;
            glBufferAddressRangeNV(GL_UNIFORM_BUFFER_ADDRESS_NV, cmd->index, GLuint64(cmd->addressLo) | (GLuint64(cmd->addressHi)<<32), 0x10000);
          }
          else{
            const UniformAddressCommandEMU* cmd = (const UniformAddressCommandEMU*)current;
            glBindBufferRange(GL_UNIFORM_BUFFER,cmd->index, cmd->buffer, cmd->offset256 * 256, cmd->size4*4);
          }
        }
        break;
      case GL_BLEND_COLOR_COMMAND_NV:
        {
          const BlendColorCommandNV* cmd = (const BlendColorCommandNV*)current;
          glBlendColor(cmd->red,cmd->green,cmd->blue,cmd->alpha);
        }
        break;
      case GL_STENCIL_REF_COMMAND_NV:
        {
          const StencilRefCommandNV* cmd = (const StencilRefCommandNV*)current;
          glStencilFuncSeparate(GL_FRONT, state.stencil.funcs[StateSystem::FACE_FRONT].func, cmd->frontStencilRef, state.stencil.funcs[StateSystem::FACE_FRONT].mask);
          glStencilFuncSeparate(GL_BACK,  state.stencil.funcs[StateSystem::FACE_BACK ].func, cmd->backStencilRef,  state.stencil.funcs[StateSystem::FACE_BACK ].mask);
        }
        break;

      case GL_LINE_WIDTH_COMMAND_NV:
        {
          const LineWidthCommandNV* cmd = (const LineWidthCommandNV*)current;
          glLineWidth(cmd->lineWidth);
        }
        break;
      case GL_POLYGON_OFFSET_COMMAND_NV:
        {
          const PolygonOffsetCommandNV* cmd = (const PolygonOffsetCommandNV*)current;
          glPolygonOffset(cmd->scale,cmd->bias);
        }
        break;
      case GL_ALPHA_REF_COMMAND_NV:
        {/*
          const AlphaRefCommandNV* cmd = (const AlphaRefCommandNV*)current;
          glAlphaFunc(state.alpha.mode, cmd->alphaRef);
          */
        }
        break;
      case GL_VIEWPORT_COMMAND_NV:
        {
          const ViewportCommandNV* cmd = (const ViewportCommandNV*)current;
          glViewport(cmd->x, cmd->y, cmd->width, cmd->height);
        }
        break;
      case GL_SCISSOR_COMMAND_NV:
        {
          const ScissorCommandNV* cmd = (const ScissorCommandNV*)current;
          glScissor(cmd->x,cmd->y,cmd->width,cmd->height);
        }
        break;
      case GL_FRONT_FACE_COMMAND_NV:
        {
          FrontFaceCommandNV* cmd = (FrontFaceCommandNV*)current;
          glFrontFace(cmd->frontFace?GL_CW:GL_CCW);
        }
        break;
      }


      GLuint tokenSize = s_nvcmdlist_headerSizes[cmdtype];
      assert(tokenSize);

      current += tokenSize;

    }
    return type;
  }

  // Emulates `count` token sequences that all use the same primitive mode
  // and state. Sequence i covers sizes[i] bytes starting at offsets[i]
  // inside `stream`. The index type starts as GL_UNSIGNED_SHORT and is
  // carried over from one sequence to the next.
  void nvtokenDrawCommandsSW(GLenum mode, const void* NV_RESTRICT stream, size_t streamSize, 
    const GLintptr* NV_RESTRICT offsets, const GLsizei* NV_RESTRICT sizes, 
    GLuint count, 
    StateSystem::State &state)
  {
    const char* NV_RESTRICT base = (const char*)stream;
    GLenum indexType = GL_UNSIGNED_SHORT;

    GLuint seq = 0;
    while (seq < count)
    {
      const size_t seqOffset = offsets[seq];
      const size_t seqSize   = sizes[seq];

      assert(seqSize + seqOffset <= streamSize);

      indexType = nvtokenDrawCommandSequenceSW(base + seqOffset, seqSize, mode, indexType, state);
      ++seq;
    }

  }

#if NVTOKEN_STATESYSTEM
  void nvtokenDrawCommandsStatesSW(const void* NV_RESTRICT stream, size_t streamSize, 
    const GLintptr* NV_RESTRICT offsets, const GLsizei* NV_RESTRICT sizes, 
    const GLuint* NV_RESTRICT states, const GLuint* NV_RESTRICT fbos, GLuint count, 
    StateSystem &stateSystem)
  {
    int lastFbo = ~0;
    const char* NV_RESTRICT tokens = (const char*)stream;

    StateSystem::StateID lastID;

    GLenum type = GL_UNSIGNED_SHORT;
    for (GLuint i = 0; i < count; i++)
    {
      GLuint fbo;

      StateSystem::StateID curID = states[i];
      const StateSystem::State&  state = stateSystem.get(curID);

      if (fbos[i]){
        fbo = fbos[i];
      }
      else{
        fbo = state.fbo.fboDraw;
      }

      if (fbo != lastFbo){
        glBindFramebuffer(GL_FRAMEBUFFER, fbo);
        lastFbo = fbo;
      }

      if (i == 0){
        stateSystem.applyGL( curID, true ); // quite costly
      }
      else {
        stateSystem.applyGL( curID, lastID, true );
      }
      lastID = curID;

      size_t offset = offsets[i];
      size_t size   = sizes[i];

      GLenum mode = state.basePrimitiveMode;

      assert(size + offset <= streamSize);

      type = nvtokenDrawCommandSequenceSW(&tokens[offset], size, mode, type, state);
    }
  }
#endif
}


================================================
FILE: nvtoken.hpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */


#include <assert.h>
#include <string>
#include <vector>

#define NVTOKEN_STATESYSTEM 1

#include "platform.h"
#include <nvgl/extensions_gl.hpp>
#if NVTOKEN_STATESYSTEM
// not needed if emulation is not used, or implemented differently
#include "statesystem.hpp"
#else
// Stand-in for the real state system, compiled only when NVTOKEN_STATESYSTEM
// is 0 (the "statesystem.hpp" include is skipped in that case). It declares
// just the State fields shown below.
namespace StateSystem {
  // Minimal emulation layer

  // Face selector used to index State::stencil.funcs.
  enum Faces {
    FACE_FRONT,
    FACE_BACK,
    MAX_FACES,
  };
  struct State {
    // Per-binding vertex stride, 16 binding slots.
    struct {
      struct {
        GLsizei stride;
      }bindings[16];
    }vertexformat;

    // Alpha test function; paired with the alpha reference token
    // (that emulation call is currently commented out in nvtoken.cpp).
    struct {
      GLenum mode;
    }alpha;

    // Stencil compare function and mask per face; the stencil reference value
    // comes from the token and is combined with these in glStencilFuncSeparate.
    struct {
      struct {
        GLenum func;
        GLuint mask;
      }funcs[MAX_FACES];
    }stencil;
  };
}
#endif


namespace nvtoken
{

  //////////////////////////////////////////////////////////////////////////
  // generic

  // Size of the per-command lookup tables (s_nvcmdlist_header, s_nvcmdlist_headerSizes)
  // and of the stats array taken by nvtokenGetStats.
  // not the cleanest way: assumes GL_FRONT_FACE_COMMAND_NV is the largest
  // command enum value used as an index -- TODO confirm against the extension header.
  #define NVTOKEN_TYPES (GL_FRONT_FACE_COMMAND_NV+1)

  // Shader stage selector for uniform-address tokens. Values are used as
  // indices into s_nvcmdlist_stages (see NVTokenUbo::setBinding);
  // NVTOKEN_STAGES is the number of stages and the size of that table.
  enum NVTokenShaderStage {
    NVTOKEN_STAGE_VERTEX,
    NVTOKEN_STAGE_TESS_CONTROL,
    NVTOKEN_STAGE_TESS_EVALUATION,
    NVTOKEN_STAGE_GEOMETRY,
    NVTOKEN_STAGE_FRAGMENT,
    NVTOKEN_STAGES,
  };

  // Shared lookup state used by the token structs below.
  // NOTE(review): presumably filled in by nvtokenInitInternals -- confirm in nvtoken.cpp.

  // true: address tokens store a 64-bit GPU address; false: they store a
  // buffer object name plus offset (the *EMU layouts).
  extern bool     s_nvcmdlist_bindless;
  // Header word for each command type, indexed by the GL_*_COMMAND_NV enum.
  extern GLuint   s_nvcmdlist_header[NVTOKEN_TYPES];
  // Size of each command type's token, indexed by the same enum; the software
  // path advances its read pointer by this amount.
  extern GLuint   s_nvcmdlist_headerSizes[NVTOKEN_TYPES];
  // Stage value written into uniform-address tokens, indexed by NVTokenShaderStage.
  extern GLushort s_nvcmdlist_stages[NVTOKEN_STAGES];
  
  // Write cursor over a caller-provided memory range. The stream does not own
  // the memory; init() only records the range and resets the cursor.
  class NVPointerStream {
  public:
    size_t          m_max;    // capacity in bytes, as passed to init()
    unsigned char*  m_begin;  // first byte of the range
    unsigned char*  m_end;    // one past the last byte of the range
    unsigned char* NV_RESTRICT m_cur; // next write position

    // Attaches the stream to `size` bytes starting at `data` and rewinds it.
    void init(void* data, size_t size)
    {
      unsigned char* const first = (unsigned char*)data;

      m_max   = size;
      m_begin = first;
      m_cur   = first;
      m_end   = first + size;
    }

    // Total number of bytes the attached range can hold.
    size_t  capacity() const
    {
      return m_max;
    }

    // Number of bytes written so far (distance from start to cursor).
    size_t size() const
    {
      return m_cur - m_begin;
    }
  };

  // Parallel per-sequence arrays; element types match the pointer parameters
  // of nvtokenDrawCommandsStatesSW (offsets, sizes, states, fbos).
  struct NVTokenSequence {
    std::vector<GLintptr>  offsets; // byte offset of each sequence in the token stream
    std::vector<GLsizei>   sizes;   // byte size of each sequence
    std::vector<GLuint>    states;  // state id per sequence
    std::vector<GLuint>    fbos;    // framebuffer per sequence; the software path treats 0 as "use the state's FBO"
  };

#pragma pack(push,1)

  // Non-bindless layout of the element-address token: a buffer object name
  // instead of a GPU address. Overlaid on ElementAddressCommandNV via the
  // union in NVTokenIbo.
  // NOTE(review): field order presumably mirrors the NV struct (header,
  // addressLo, addressHi, typeSizeInByte) so both views agree -- confirm.
  typedef struct {
    GLuint   header;
    GLuint   buffer;          // buffer object name
    GLuint   _pad;            // written as 0 by NVTokenIbo::setBuffer
    GLuint   typeSizeInByte;
  } ElementAddressCommandEMU;

  // Non-bindless layout of the attribute-address token: buffer object name
  // plus offset instead of a GPU address. Overlaid on
  // AttributeAddressCommandNV via the union in NVTokenVbo.
  typedef struct {
    GLuint   header;
    GLuint   index;   // NOTE(review): presumably overlaps the NV struct's binding index -- confirm
    GLuint   buffer;  // buffer object name
    GLuint   offset;  // offset passed to NVTokenVbo::setBuffer
  } AttributeAddressCommandEMU;

  // Non-bindless layout of the uniform-address token. Overlaid on
  // UniformAddressCommandNV via the union in NVTokenUbo. Offset and size are
  // stored in reduced units so they fit into 16 bits each.
  typedef struct {
    GLuint      header;
    GLushort    index;
    GLushort    stage;
    GLuint      buffer;     // buffer object name
    GLushort    offset256;  // offset / 256 (NVTokenUbo::setBuffer asserts offset % 256 == 0)
    GLushort    size4;      // size / 4     (NVTokenUbo::setBuffer asserts size % 4 == 0)
  } UniformAddressCommandEMU;


  // Token wrapper for GL_NOP_COMMAND_NV. Construction stamps the header word
  // looked up for this command type; the token carries no other data here.
  struct NVTokenNop {
    static const GLenum   ID = GL_NOP_COMMAND_NV;

    NOPCommandNV      cmd;

    NVTokenNop()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }
  };

  // Token wrapper for GL_TERMINATE_SEQUENCE_COMMAND_NV. Construction stamps
  // the header word looked up for this command type.
  struct NVTokenTerminate {
    static const GLenum   ID = GL_TERMINATE_SEQUENCE_COMMAND_NV;

    TerminateSequenceCommandNV      cmd;

    NVTokenTerminate()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }
  };

  // Token wrapper for GL_DRAW_ELEMENTS_INSTANCED_COMMAND_NV.
  // Defaults: GL_TRIANGLES, zero indices, a single instance, all bases 0.
  struct NVTokenDrawElemsInstanced {
    static const GLenum   ID = GL_DRAW_ELEMENTS_INSTANCED_COMMAND_NV;

    DrawElementsInstancedCommandNV   cmd;

    NVTokenDrawElemsInstanced()
    {
      cmd.header        = s_nvcmdlist_header[ID];

      cmd.mode          = GL_TRIANGLES;
      cmd.count         = 0;
      cmd.instanceCount = 1;
      cmd.firstIndex    = 0;
      cmd.baseVertex    = 0;
      cmd.baseInstance  = 0;
    }

    // Sets the primitive mode stored in the token.
    void setMode(GLenum primmode)
    {
      cmd.mode = primmode;
    }

    // Sets index count, first index and base vertex.
    void setParams(GLuint count, GLuint firstIndex=0, GLuint baseVertex=0)
    {
      cmd.baseVertex = baseVertex;
      cmd.firstIndex = firstIndex;
      cmd.count      = count;
    }

    // Sets instance count and base instance.
    void setInstances(GLuint count, GLuint baseInstance=0)
    {
      cmd.instanceCount = count;
      cmd.baseInstance  = baseInstance;
    }
  };

  // Token wrapper for GL_DRAW_ARRAYS_INSTANCED_COMMAND_NV.
  // Defaults: GL_TRIANGLES, zero vertices, a single instance, all bases 0.
  struct NVTokenDrawArraysInstanced {
    static const GLenum   ID = GL_DRAW_ARRAYS_INSTANCED_COMMAND_NV;

    DrawArraysInstancedCommandNV          cmd;

    NVTokenDrawArraysInstanced()
    {
      cmd.header        = s_nvcmdlist_header[ID];

      cmd.mode          = GL_TRIANGLES;
      cmd.count         = 0;
      cmd.instanceCount = 1;
      cmd.first         = 0;
      cmd.baseInstance  = 0;
    }

    // Sets the primitive mode stored in the token.
    void setMode(GLenum primmode)
    {
      cmd.mode = primmode;
    }

    // Sets vertex count and first vertex.
    void setParams(GLuint count, GLuint first=0)
    {
      cmd.first = first;
      cmd.count = count;
    }

    // Sets instance count and base instance.
    void setInstances(GLuint count, GLuint baseInstance=0)
    {
      cmd.instanceCount = count;
      cmd.baseInstance  = baseInstance;
    }
  };

  // Token wrapper for non-instanced indexed draws. The token has no mode
  // field; setMode() instead picks between the list and the strip command
  // header depending on the primitive mode.
  struct NVTokenDrawElems {
    static const GLenum   ID = GL_DRAW_ELEMENTS_COMMAND_NV;

    DrawElementsCommandNV   cmd;

    NVTokenDrawElems()
    {
      cmd.header     = s_nvcmdlist_header[ID];

      cmd.count      = 0;
      cmd.firstIndex = 0;
      cmd.baseVertex = 0;
    }

    // Sets index count, first index and base vertex.
    void setParams(GLuint count, GLuint firstIndex=0, GLuint baseVertex=0)
    {
      cmd.baseVertex = baseVertex;
      cmd.firstIndex = firstIndex;
      cmd.count      = count;
    }

    // Selects the strip or list header for `primmode`.
    // Fans and line loops are rejected (debug builds only); the GL_POLYGON
    // and GL_QUAD_STRIP checks of the original remain disabled.
    void setMode(GLenum primmode)
    {
      assert(primmode != GL_TRIANGLE_FAN && /* primmode != GL_POLYGON && */ primmode != GL_LINE_LOOP);

      const bool stripLike =
           primmode == GL_LINE_STRIP
        || primmode == GL_TRIANGLE_STRIP /* || primmode == GL_QUAD_STRIP */
        || primmode == GL_LINE_STRIP_ADJACENCY
        || primmode == GL_TRIANGLE_STRIP_ADJACENCY;

      if (stripLike)
      {
        cmd.header = s_nvcmdlist_header[GL_DRAW_ELEMENTS_STRIP_COMMAND_NV];
      }
      else
      {
        cmd.header = s_nvcmdlist_header[GL_DRAW_ELEMENTS_COMMAND_NV];
      }
    }
  };

  // Token wrapper for non-instanced, non-indexed draws. The token has no mode
  // field; setMode() instead picks between the list and the strip command
  // header depending on the primitive mode.
  struct NVTokenDrawArrays {
    static const GLenum   ID = GL_DRAW_ARRAYS_COMMAND_NV;

    DrawArraysCommandNV   cmd;

    NVTokenDrawArrays()
    {
      cmd.header = s_nvcmdlist_header[ID];

      cmd.count  = 0;
      cmd.first  = 0;
    }

    // Sets vertex count and first vertex.
    void setParams(GLuint count, GLuint first=0)
    {
      cmd.first = first;
      cmd.count = count;
    }

    // Selects the strip or list header for `primmode`.
    // Fans and line loops are rejected (debug builds only); the GL_POLYGON
    // and GL_QUAD_STRIP checks of the original remain disabled.
    void setMode(GLenum primmode)
    {
      assert(primmode != GL_TRIANGLE_FAN && /* primmode != GL_POLYGON && */ primmode != GL_LINE_LOOP);

      const bool stripLike =
           primmode == GL_LINE_STRIP
        || primmode == GL_TRIANGLE_STRIP /* || primmode == GL_QUAD_STRIP */
        || primmode == GL_LINE_STRIP_ADJACENCY
        || primmode == GL_TRIANGLE_STRIP_ADJACENCY;

      if (stripLike)
      {
        cmd.header = s_nvcmdlist_header[GL_DRAW_ARRAYS_STRIP_COMMAND_NV];
      }
      else
      {
        cmd.header = s_nvcmdlist_header[GL_DRAW_ARRAYS_COMMAND_NV];
      }
    }
  };

  // Token wrapper for GL_DRAW_ELEMENTS_STRIP_COMMAND_NV. Uses the same payload
  // struct as NVTokenDrawElems but always carries the strip header.
  struct NVTokenDrawElemsStrip {
    static const GLenum   ID = GL_DRAW_ELEMENTS_STRIP_COMMAND_NV;

    DrawElementsCommandNV   cmd;

    NVTokenDrawElemsStrip()
    {
      cmd.header     = s_nvcmdlist_header[ID];

      cmd.count      = 0;
      cmd.firstIndex = 0;
      cmd.baseVertex = 0;
    }

    // Sets index count, first index and base vertex.
    void setParams(GLuint count, GLuint firstIndex=0, GLuint baseVertex=0)
    {
      cmd.baseVertex = baseVertex;
      cmd.firstIndex = firstIndex;
      cmd.count      = count;
    }
  };

  // Token wrapper for GL_DRAW_ARRAYS_STRIP_COMMAND_NV. Uses the same payload
  // struct as NVTokenDrawArrays but always carries the strip header.
  struct NVTokenDrawArraysStrip {
    static const GLenum   ID = GL_DRAW_ARRAYS_STRIP_COMMAND_NV;

    DrawArraysCommandNV   cmd;

    NVTokenDrawArraysStrip()
    {
      cmd.header = s_nvcmdlist_header[ID];

      cmd.count  = 0;
      cmd.first  = 0;
    }

    // Sets vertex count and first vertex.
    void setParams(GLuint count, GLuint first=0)
    {
      cmd.first = first;
      cmd.count = count;
    }
  };

  // Token wrapper for GL_ATTRIBUTE_ADDRESS_COMMAND_NV (vertex buffer binding).
  // The payload is viewed either as the NV layout (64-bit address) or as the
  // EMU layout (buffer name + offset), selected by s_nvcmdlist_bindless.
  // The constructor only sets the header; index and buffer must be set by the caller.
  struct NVTokenVbo {
    static const GLenum   ID = GL_ATTRIBUTE_ADDRESS_COMMAND_NV;

    union {
      AttributeAddressCommandNV   cmd;
      AttributeAddressCommandEMU  cmdEMU;
    };

    NVTokenVbo()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }

    // Sets the vertex binding index this token addresses.
    void setBinding(GLuint idx)
    {
      cmd.index = idx;
    }

    // Stores the buffer location.
    //   bindless : address + offset, split into low/high 32-bit words
    //   emulated : buffer object name and offset as given
    void setBuffer(GLuint buffer, GLuint64 address, GLuint offset)
    {
      if (!s_nvcmdlist_bindless){
        cmdEMU.buffer = buffer;
        cmdEMU.offset = offset;
        return;
      }

      address += offset;
      cmd.addressLo = GLuint(address & 0xFFFFFFFF);
      cmd.addressHi = GLuint(address >> 32);
    }
  };

  // Token wrapper for GL_ELEMENT_ADDRESS_COMMAND_NV (index buffer binding).
  // The payload is viewed either as the NV layout (64-bit address) or as the
  // EMU layout (buffer name), selected by s_nvcmdlist_bindless.
  // The constructor only sets the header; type and buffer must be set by the caller.
  struct NVTokenIbo {
    static const GLenum   ID = GL_ELEMENT_ADDRESS_COMMAND_NV;

    union{
      ElementAddressCommandNV     cmd;
      ElementAddressCommandEMU    cmdEMU;
    };

    NVTokenIbo()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }

    // Translates a GL index type into its byte size (1, 2 or 4).
    // Any other type trips the assert in debug builds and leaves the field untouched.
    void setType(GLenum type)
    {
      switch (type){
      case GL_UNSIGNED_BYTE:
        cmd.typeSizeInByte = 1;
        break;
      case GL_UNSIGNED_SHORT:
        cmd.typeSizeInByte = 2;
        break;
      case GL_UNSIGNED_INT:
        cmd.typeSizeInByte = 4;
        break;
      default:
        assert(0 && "illegal type");
        break;
      }
    }

    // Stores the buffer location.
    //   bindless : address split into low/high 32-bit words
    //   emulated : buffer object name, padding word cleared
    void setBuffer(GLuint buffer, GLuint64 address)
    {
      if (!s_nvcmdlist_bindless){
        cmdEMU.buffer = buffer;
        cmdEMU._pad   = 0;
        return;
      }

      cmd.addressLo = GLuint(address & 0xFFFFFFFF);
      cmd.addressHi = GLuint(address >> 32);
    }
  };

  // Token wrapper for GL_UNIFORM_ADDRESS_COMMAND_NV (uniform buffer binding).
  // The payload is viewed either as the NV layout (64-bit address) or as the
  // EMU layout (buffer name, offset/256, size/4), selected by s_nvcmdlist_bindless.
  // The constructor only sets the header; binding and buffer must be set by the caller.
  struct NVTokenUbo {
    static const GLenum   ID = GL_UNIFORM_ADDRESS_COMMAND_NV;

    union{
      UniformAddressCommandNV   cmd;
      UniformAddressCommandEMU  cmdEMU;
    };

    NVTokenUbo()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }

    // Sets the uniform binding index and the shader stage it applies to.
    // The stage is translated through s_nvcmdlist_stages.
    void setBinding(GLuint idx, NVTokenShaderStage stage)
    {
      cmd.stage = s_nvcmdlist_stages[stage];
      cmd.index = idx;
    }

    // Stores the buffer range. `size` must be a multiple of 4 and `offset` a
    // multiple of 256 (checked in debug builds only).
    //   bindless : address + offset, split into low/high 32-bit words (size is not stored)
    //   emulated : buffer object name, offset in 256-byte units, size in 4-byte units
    void setBuffer(GLuint buffer, GLuint64 address, GLuint offset, GLuint size)
    {
      assert(size % 4 == 0 && offset % 256 == 0);

      if (!s_nvcmdlist_bindless){
        cmdEMU.buffer    = buffer;
        cmdEMU.offset256 = offset / 256;
        cmdEMU.size4     = size / 4;
        return;
      }

      address += offset;
      cmd.addressLo = GLuint(address & 0xFFFFFFFF);
      cmd.addressHi = GLuint(address >> 32);
    }
  };

  // Token wrapper for GL_BLEND_COLOR_COMMAND_NV. Only the header is set on
  // construction; the payload fields of `cmd` are left for the caller to fill.
  struct NVTokenBlendColor{
    static const GLenum   ID = GL_BLEND_COLOR_COMMAND_NV;

    BlendColorCommandNV     cmd;

    NVTokenBlendColor()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }
  };

  // Token wrapper for GL_STENCIL_REF_COMMAND_NV. Only the header is set on
  // construction; the reference values in `cmd` are left for the caller to fill.
  struct NVTokenStencilRef{
    static const GLenum   ID = GL_STENCIL_REF_COMMAND_NV;

    StencilRefCommandNV cmd;

    NVTokenStencilRef()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }
  };

  // Token wrapper for GL_LINE_WIDTH_COMMAND_NV. Only the header is set on
  // construction; `cmd.lineWidth` is left for the caller to fill.
  struct NVTokenLineWidth{
    static const GLenum   ID = GL_LINE_WIDTH_COMMAND_NV;

    LineWidthCommandNV  cmd;

    NVTokenLineWidth()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }
  };

  // Token wrapper for GL_POLYGON_OFFSET_COMMAND_NV. Only the header is set on
  // construction; `cmd.scale` and `cmd.bias` are left for the caller to fill.
  struct NVTokenPolygonOffset{
    static const GLenum   ID = GL_POLYGON_OFFSET_COMMAND_NV;

    PolygonOffsetCommandNV  cmd;

    NVTokenPolygonOffset()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }
  };

  // Token wrapper for GL_ALPHA_REF_COMMAND_NV. Only the header is set on
  // construction. Note that the software emulation path currently ignores
  // this token (its glAlphaFunc call is commented out).
  struct NVTokenAlphaRef{
    static const GLenum   ID = GL_ALPHA_REF_COMMAND_NV;

    AlphaRefCommandNV cmd;

    NVTokenAlphaRef()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }
  };

  // Token wrapper for GL_VIEWPORT_COMMAND_NV. Only the header is set on
  // construction; x, y, width and height in `cmd` are left for the caller to fill.
  struct NVTokenViewport{
    static const GLenum   ID = GL_VIEWPORT_COMMAND_NV;

    ViewportCommandNV cmd;

    NVTokenViewport()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }
  };

  // Token wrapper for GL_SCISSOR_COMMAND_NV. Only the header is set on
  // construction; x, y, width and height in `cmd` are left for the caller to fill.
  struct NVTokenScissor {
    static const GLenum   ID = GL_SCISSOR_COMMAND_NV;

    ScissorCommandNV  cmd;

    NVTokenScissor()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }
  };

  // Token wrapper for GL_FRONT_FACE_COMMAND_NV. Only the header is set on
  // construction; call setFrontFace() to fill the payload.
  struct NVTokenFrontFace {
    static const GLenum   ID = GL_FRONT_FACE_COMMAND_NV;

    FrontFaceCommandNV  cmd;

    NVTokenFrontFace()
    {
      cmd.header = s_nvcmdlist_header[ID];
    }

    // Encodes the winding as 1 for GL_CCW and 0 for anything else.
    // NOTE(review): the software emulation in nvtoken.cpp calls
    // glFrontFace(cmd->frontFace ? GL_CW : GL_CCW), which looks inverted
    // relative to this encoding -- confirm against the NV_command_list spec.
    void setFrontFace(GLenum winding)
    {
      cmd.frontFace = (winding == GL_CCW) ? 1 : 0;
    }
  };

#pragma pack(pop)

  template <class T>
  void nvtokenMakeNop(T & token){
    NVTokenNop *nop = (NVTokenNop*)&token;
    for (size_t i = 0; i < (sizeof(T))/4; i++){
      nop[i] = NVTokenNop();
    }
  }

  // Appends the raw bytes of `data` to the byte string `queue`.
  // Returns the byte offset at which the token was written, i.e. the size of
  // the queue before the append.
  template <class T>
  size_t nvtokenEnqueue(std::string& queue, T& data)
  {
    const size_t writtenAt = queue.size();

    // Copy the object representation first, then append the copy.
    const std::string bytes(reinterpret_cast<const char*>(&data), sizeof(T));
    queue.append(bytes);

    return writtenAt;
  }

  // Copies the raw bytes of `data` to the stream's cursor and advances the
  // cursor by sizeof(T). Returns the byte offset (relative to the start of the
  // stream) at which the token was written.
  // Running past the end of the stream is only caught by the assert, i.e. in
  // debug builds.
  // NOTE(review): memcpy is used without this header including <string.h>;
  // it relies on an earlier include providing the declaration.
  template <class T>
  size_t nvtokenEnqueue(NVPointerStream& queue, T& data)
  {
    unsigned char* const dst = queue.m_cur;
    assert(dst + sizeof(T) <= queue.m_end);

    const size_t writtenAt = dst - queue.m_begin;

    memcpy(dst,&data,sizeof(T));
    queue.m_cur = dst + sizeof(T);

    return writtenAt;
  }
  
  //////////////////////////////////////////////////////////
  
  // Implementations live in nvtoken.cpp.

  // NOTE(review): presumably sets up the s_nvcmdlist_* tables for hardware or
  // emulated tokens and bindless or buffer-name addressing -- confirm in nvtoken.cpp.
  void        nvtokenInitInternals( bool hwsupport, bool bindlessSupport);
  // NOTE(review): presumably returns a printable name for a GL_*_COMMAND_NV enum -- confirm.
  const char* nvtokenCommandToString( GLenum type );
  // NOTE(review): presumably counts tokens per command type in `stream`;
  // `stats` has one slot per command type -- confirm.
  void        nvtokenGetStats( const void* NV_RESTRICT stream, size_t streamSize, int stats[NVTOKEN_TYPES]);

  // Software execution of `count` token sequences with one fixed state and
  // primitive mode. offsets/sizes give each sequence's byte range in `stream`.
  void nvtokenDrawCommandsSW(GLenum mode, const void* NV_RESTRICT stream, size_t streamSize, 
    const GLintptr* NV_RESTRICT offsets, const GLsizei* NV_RESTRICT sizes, 
    GLuint count, 
    StateSystem::State &state);

#if NVTOKEN_STATESYSTEM
  // Software execution of `count` token sequences where each sequence has its
  // own state id and framebuffer (an fbos[i] of 0 selects the state's own draw FBO).
  void nvtokenDrawCommandsStatesSW(const void* NV_RESTRICT stream, size_t streamSize, 
    const GLintptr* NV_RESTRICT offsets, const GLsizei* NV_RESTRICT sizes, 
    const GLuint* NV_RESTRICT states, const GLuint* NV_RESTRICT fbos, GLuint count, 
    StateSystem &stateSystem);
#endif
}


================================================
FILE: renderer.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include <assert.h>
#include <algorithm>
#include "renderer.hpp"

#include "common.h"

#pragma pack(1)


namespace csfviewer
{

  //////////////////////////////////////////////////////////////////////////

  // Storage for the static members declared in class Renderer (renderer.hpp).

  // Starts out disabled.
  // NOTE(review): presumably switched on elsewhere once bindless UBO support is detected -- confirm.
  bool Renderer::s_bindless_ubo = false;

  // Shared by all renderer instances.
  CullingSystem   Renderer::s_cullsys;
  ScanSystem      Renderer::s_scansys;

  // Returns a human-readable label for a shade type, or NULL for any value
  // without a label (including NUM_SHADES).
  const char* toString( enum ShadeType st )
  {
    const char* label = NULL;

    switch(st){
    case SHADE_SOLID:
      label = "solid";
      break;
    case SHADE_SOLIDWIRE:
      label = "solid w edges";
      break;
    case SHADE_SOLIDWIRE_SPLIT:
      label = "solid w edges (split)";
      break;
    default:
      break;
    }

    return label;
  }


  // Appends one DrawItem per cached draw range of `obj`.
  // The cache (solid or wire, chosen by `solid`) groups ranges by state:
  // state[s] supplies matrix/material for the next stateCount[s] entries of
  // offsets/counts. `geo` is not used by this strategy.
  static void FillCache( std::vector<Renderer::DrawItem>& drawItems, const CadScene::Object& obj, const CadScene::Geometry& geo,  bool solid, int objectIndex ) 
  {
    const CadScene::DrawRangeCache &cache = solid ? obj.cacheSolid : obj.cacheWire;

    // Index of the first range belonging to the state currently processed.
    int firstRange = 0;

    for (size_t s = 0; s < cache.state.size(); s++)
    {
      const CadScene::DrawStateInfo &info = cache.state[s];

      // Everything except the range is constant for all draws of this state.
      Renderer::DrawItem item;
      item.solid         = solid;
      item.objectIndex   = objectIndex;
      item.geometryIndex = obj.geometryIndex;
      item.materialIndex = info.materialIndex;
      item.matrixIndex   = info.matrixIndex;

      for (int d = 0; d < cache.stateCount[s]; d++){
        item.range.offset = cache.offsets[firstRange + d];
        item.range.count  = cache.counts [firstRange + d];

        drawItems.push_back(item);
      }

      firstRange += cache.stateCount[s];
    }
  }

  static void FillJoin( std::vector<Renderer::DrawItem>& drawItems, const CadScene::Object& obj, const CadScene::Geometry& geo,  bool solid, int objectIndex ) 
  {
    CadScene::DrawRange range;

    int lastMaterial = -1;
    int lastMatrix   = -1;

    for (size_t p = 0; p < obj.parts.size(); p++){
      const CadScene::ObjectPart&   part = obj.parts[p];
      const CadScene::GeometryPart& mesh = geo.parts[p];

      if (!part.active) continue;

      if (part.materialIndex != lastMaterial || part.matrixIndex != lastMatrix){

        if (range.count){
          // evict
          Renderer::DrawItem di;
          di.geometryIndex = obj.geometryIndex;
          di.matrixIndex   = lastMatrix;
          di.materialIndex = lastMaterial;
          di.objectIndex   = objectIndex;

          di.solid = solid;
          di.range = range;

          drawItems.push_back(di);
        }

        range = CadScene::DrawRange();

        lastMaterial = part.materialIndex;
        lastMatrix   = part.matrixIndex;
      }

      if (!range.count){
        range.offset = solid ? mesh.indexSolid.offset : mesh.indexWire.offset;
      }

      range.count += solid ? mesh.indexSolid.count : mesh.indexWire.count;
    }

    // evict
    Renderer::DrawItem di;
    di.geometryIndex = obj.geometryIndex;
    di.matrixIndex   = lastMatrix;
    di.materialIndex = lastMaterial;
    di.objectIndex   = objectIndex;

    di.solid = solid;
    di.range = range;

    drawItems.push_back(di);
  }

  // Appends one DrawItem per active part of `obj`, with no merging. The range
  // is the part's solid or wire index range, chosen by `solid`.
  static void FillIndividual( std::vector<Renderer::DrawItem>& drawItems, const CadScene::Object& obj, const CadScene::Geometry& geo, bool solid, int objectIndex ) 
  {
    const size_t numParts = obj.parts.size();

    for (size_t p = 0; p < numParts; p++){
      const CadScene::ObjectPart& part = obj.parts[p];
      if (part.active){
        const CadScene::GeometryPart& mesh = geo.parts[p];

        Renderer::DrawItem item;
        item.solid         = solid;
        item.objectIndex   = objectIndex;
        item.geometryIndex = obj.geometryIndex;
        item.materialIndex = part.materialIndex;
        item.matrixIndex   = part.matrixIndex;
        item.range         = solid ? mesh.indexSolid : mesh.indexWire;

        drawItems.push_back(item);
      }
    }
  }


  // Appends draw items for the scene objects in [from, to) (clamped to the
  // number of objects) using the generator selected by m_strategy.
  // For each object the solid items are appended before the wire items;
  // `solid` / `wire` enable the respective set.
  void Renderer::fillDrawItems( std::vector<DrawItem>& drawItems, size_t from, size_t to, bool solid, bool wire )
  {
    const CadScene* NV_RESTRICT scene = m_scene;

    for (size_t i = from; i < to && i < scene->m_objects.size(); i++){
      const CadScene::Object&   obj = scene->m_objects[i];
      const CadScene::Geometry& geo = scene->m_geometry[obj.geometryIndex];
      const int objectIndex = int(i);

      switch (m_strategy){
      case STRATEGY_GROUPS:
        if (solid)  FillCache(drawItems, obj, geo, true,  objectIndex);
        if (wire)   FillCache(drawItems, obj, geo, false, objectIndex);
        break;
      case STRATEGY_JOIN:
        if (solid)  FillJoin(drawItems, obj, geo, true,  objectIndex);
        if (wire)   FillJoin(drawItems, obj, geo, false, objectIndex);
        break;
      case STRATEGY_INDIVIDUAL:
        if (solid)  FillIndividual(drawItems, obj, geo, true,  objectIndex);
        if (wire)   FillIndividual(drawItems, obj, geo, false, objectIndex);
        break;
      default:
        // Unknown strategy: nothing is generated.
        break;
      }
    }
  }

}


================================================
FILE: renderer.hpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#ifndef RENDERER_H__
#define RENDERER_H__

// bindless UBO
#ifndef GL_UNIFORM_BUFFER_UNIFIED_NV
#define GL_UNIFORM_BUFFER_UNIFIED_NV                        0x936E
#endif
#ifndef GL_UNIFORM_BUFFER_ADDRESS_NV
#define GL_UNIFORM_BUFFER_ADDRESS_NV                        0x936F
#endif
#ifndef GL_UNIFORM_BUFFER_LENGTH_NV
#define GL_UNIFORM_BUFFER_LENGTH_NV                         0x9370
#endif

#include "cadscene.hpp"
#include <NvFoundation.h>
#include <nvgl/programmanager_gl.hpp>
#include <nvgl/base_gl.hpp>
#include <nvh/profiler.hpp>
#include "cullingsystem.hpp"
#include "scansystem.hpp"

namespace csfviewer {
  #define USE_NOFILTER           0  // some renderers support turning off redundancy filter

  #define USE_WIRE_SHADERSWITCH  0  // If set we use two different shaders for tris and lines,
                                    // otherwise we use an immediate mode vertexattrib as pseudo uniform toggle.
                                    // Enable this to stress shader switching in app (becomes primary bottleneck)
  // How Renderer::fillDrawItems turns an object's parts into draw items.
  enum Strategy {
    STRATEGY_GROUPS,      // use the object's precomputed draw-range cache (one item per cached range)
    STRATEGY_JOIN,        // merge consecutive active parts that share material and matrix
    STRATEGY_INDIVIDUAL,  // one item per active part
  };

  // Shading modes; toString() gives the display label for each.
  // NUM_SHADES is the count of real modes and is used to size per-mode arrays.
  enum ShadeType {
    SHADE_SOLID,           // "solid"
    SHADE_SOLIDWIRE,       // "solid w edges"
    SHADE_SOLIDWIRE_SPLIT, // this mode is not "sane" it is only meant for performance testing of fbo toggles
    NUM_SHADES,
  };

  const char* toString(enum ShadeType st);

  struct Resources {
    GLuint    sceneUbo;
    GLuint64  sceneAddr;

    GLuint    programUbo;
    GLuint    programUboTris;
    GLuint    programUboLine;

    GLuint    programIdx;
    GLuint    programIdxTris;
    GLuint    programIdxLine;

    GLuint    fbo;
    GLuint    fbo2;

    size_t    stateChangeID;
    size_t    fboTextureChangeID;

    CullingSystem::View cullView;

    // ugly hack
    mutable GLuint programUsed;
    mutable GLuint programUsedTris;
    mutable GLuint programUsedLine;

    void usingUboProgram(bool ubo=true) const
    {
      programUsed     = ubo ? programUbo     : programIdx;
      programUsedTris = ubo ? programUboTris : programIdxTris;
      programUsedLine = ubo ? programUboLine : programIdxLine;
    }

    Resources() {
      stateChangeID = 0;
      fboTextureChangeID = 0;
    }
  };

// SetWireMode(state): switches between triangle and line shading.
// With USE_WIRE_SHADERSWITCH it binds the line or triangle program and
// expects a variable named `resources` in scope at the call site; otherwise it
// sets the VERTEX_WIREMODE integer vertex attribute to `state`.
#if USE_WIRE_SHADERSWITCH
  #define SetWireMode(state) glUseProgram((state) ? resources.programUsedLine : resources.programUsedTris )
#else
  #define SetWireMode(state) glVertexAttribI1i(VERTEX_WIREMODE,(state))
#endif

  class Renderer {
  public:

    struct DrawItem {
      bool                solid;
      int                 materialIndex;
      int                 geometryIndex;
      int                 matrixIndex;
      int                 objectIndex;
      CadScene::DrawRange range;
    };

    static bool DrawItem_compare_groups(const DrawItem& a, const DrawItem& b)
    {
      int diff = 0;
      diff = diff != 0 ? diff : (a.solid == b.solid ? 0 : ( a.solid ? -1 : 1 ));
      diff = diff != 0 ? diff : (a.materialIndex - b.materialIndex);
      diff = diff != 0 ? diff : (a.geometryIndex - b.geometryIndex);
      diff = diff != 0 ? diff : (a.matrixIndex - b.matrixIndex);

      return diff < 0;
    }

    class Type {
    public:
      Type() {
        getRegistry().push_back(this);
      }

    public:
      virtual bool loadPrograms( nvgl::ProgramManager &mgr ) { return true; }
      virtual void updatedPrograms( nvgl::ProgramManager &mgr ) { }
      virtual bool isAvailable() const = 0;
      virtual const char* name() const = 0;
      virtual Renderer* create() const = 0;
      virtual unsigned int priority() const { return 0xFF; } 
    };

    typedef std::vector<Type*> Registry;

    static bool s_bindless_ubo;
    static Registry& getRegistry()
    {
      static Registry s_registry;
      return s_registry;
    }

    static CullingSystem   s_cullsys;
    static ScanSystem      s_scansys;

  public:
    virtual void init(const CadScene* NV_RESTRICT scene, const Resources& resources) {}
    virtual void deinit() {}
    virtual void draw(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager ) {}
    virtual ~Renderer() {}


    void fillDrawItems( std::vector<DrawItem>& drawItems, size_t from, size_t to, bool solid, bool wire);

    Strategy                    m_strategy;
    const CadScene* NV_RESTRICT  m_scene;
  };
}

#endif


================================================
FILE: rendererindexedmdi.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include <assert.h>
#include <algorithm>
#include "renderer.hpp"

#include "common.h"

// Compile-time switches for this renderer.
// USE_VERTEX_ASSIGNS : matrix/material indices go into a separate "assigns"
//                      array and baseInstance selects the entry; when off,
//                      both indices are packed into baseInstance directly.
//                      NOTE(review): USE_BASEINSTANCE is presumably defined in common.h -- confirm.
// USE_GPU_INDIRECT   : indirect commands are uploaded into a GL buffer.
// USE_CPU_INDIRECT   : the opposite of USE_GPU_INDIRECT.
#define USE_VERTEX_ASSIGNS  (!USE_BASEINSTANCE)
#define USE_GPU_INDIRECT    1
#define USE_CPU_INDIRECT    (!USE_GPU_INDIRECT)

namespace csfviewer
{
  //////////////////////////////////////////////////////////////////////////

  // Renderer that builds arrays of indirect draw commands and groups them
  // into segments by geometry. Four registry variants exist, combining
  // sorting of the draw items (m_sort) and bindless buffer addresses (m_vbum).
  class RendererIndexedMDI: public Renderer {
  public:
    // Plain variant: unsorted, no bindless addresses.
    class Type : public Renderer::Type 
    {
      bool isAvailable() const { return true; }

      const char* name() const { return "indexedmdi"; }

      Renderer* create() const
      {
        return new RendererIndexedMDI();
      }

      unsigned int priority() const { return 3; }
    };

    // Bindless variant; only offered when GL_NV_vertex_buffer_unified_memory is present.
    class TypeVbum : public Renderer::Type 
    {
      bool isAvailable() const { return !!has_GL_NV_vertex_buffer_unified_memory; }

      const char* name() const { return "indexedmdi_bindless"; }

      Renderer* create() const
      {
        RendererIndexedMDI* instance = new RendererIndexedMDI();
        instance->m_vbum = true;
        return instance;
      }

      unsigned int priority() const { return 3; }
    };

    // Sorted variant.
    class TypeSort : public Renderer::Type 
    {
      bool isAvailable() const { return true; }

      const char* name() const { return "indexedmdi_sorted"; }

      Renderer* create() const
      {
        RendererIndexedMDI* instance = new RendererIndexedMDI();
        instance->m_sort = true;
        return instance;
      }

      unsigned int priority() const { return 3; }
    };

    // Sorted and bindless variant; needs GL_NV_vertex_buffer_unified_memory.
    class TypeSortVbum : public Renderer::Type 
    {
      bool isAvailable() const { return !!has_GL_NV_vertex_buffer_unified_memory; }

      const char* name() const { return "indexedmdi_sorted_bindless"; }

      Renderer* create() const
      {
        RendererIndexedMDI* instance = new RendererIndexedMDI();
        instance->m_vbum = true;
        instance->m_sort = true;
        return instance;
      }

      unsigned int priority() const { return 3; }
    };

  private:
    // One indirect indexed draw command (five 32-bit fields).
    // Defaults describe an empty draw of a single instance.
    struct DrawIndirectGL {
      GLuint count;
      GLuint instanceCount;
      GLuint firstIndex;
      GLint  baseVertex;
      GLuint baseInstance;

      DrawIndirectGL ()
      {
        count         = 0;
        instanceCount = 1;
        firstIndex    = 0;
        baseVertex    = 0;
        baseInstance  = 0;
      }
    };

    struct IndexedCommand {
      DrawIndirectGL  cmd;
    };

    // Everything generated for one shade type: the command stream, the
    // optional matrix/material assignments, and the per-segment tables
    // (offsets/sizes are in commands; geometries/solids describe each segment).
    struct ShadeCommand {
      std::vector<IndexedCommand> indirects;
      std::vector<int>      assigns;

      std::vector<size_t>   sizes;
      std::vector<size_t>   offsets;
      std::vector<int>      geometries;
      std::vector<bool>     solids;

#if USE_GPU_INDIRECT
      GLuint    indirectGL;
      GLuint64  indirectADDR;
#endif

#if USE_VERTEX_ASSIGNS
      GLuint    assignGL;
      GLuint64  assignADDR;
#endif

      // Only the buffer names are cleared; the addresses are set in init().
      ShadeCommand() {
#if USE_GPU_INDIRECT
        indirectGL = 0;
#endif
#if USE_VERTEX_ASSIGNS
        assignGL = 0;
#endif
      }
    };

  public:
    void init(const CadScene* NV_RESTRICT scene, const Resources& resources);
    void deinit();
    void draw(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager);

    bool                        m_vbum;
    bool                        m_sort;


    RendererIndexedMDI()
      : m_vbum(false) 
      , m_sort(false)
    {

    }

  private:

    ShadeCommand    m_shades[NUM_SHADES];

    // Packs both indices into one 32-bit value: matrix index in the low
    // 20 bits, material index in the 12 bits above. Range is checked in
    // debug builds only.
    GLuint packBaseInstance( int matrixIndex, int materialIndex )
    {
      assert( materialIndex <= 0xFFF );
      assert( matrixIndex   <= 0xFFFFF );

      const GLuint matrixBits   = GLuint(matrixIndex);
      const GLuint materialBits = GLuint(materialIndex) << 20;
      return (matrixBits | materialBits);
    }

    // Records the segment that spans from `begin` to the current end of the
    // command stream, together with the geometry/solid flag it belongs to.
    static void closeSegment( ShadeCommand& sc, size_t begin, bool solid, int geometry )
    {
      sc.offsets.push_back( begin );
      sc.sizes.  push_back( GLsizei((sc.indirects.size()-begin)) );
      sc.solids. push_back( solid );
      sc.geometries.push_back( geometry );
    }

    // Rebuilds m_shades[shade] from `drawItems`.
    // A new segment starts whenever the geometry changes (and, for
    // SHADE_SOLIDWIRE, whenever solid/wire flips). For SHADE_SOLID wire items
    // are skipped; with m_sort the loop stops at the first wire item because
    // the sort order puts all solid items first.
    // NOTE(review): the very first item always closes an empty segment with
    // geometry -1 (lastGeometry starts at -1); confirm draw() tolerates it.
    void GenerateIndirects(std::vector<DrawItem>& drawItems, ShadeType shade, const CadScene* NV_RESTRICT scene, const Resources& resources )
    {
      int lastMaterial = -1;
      int lastGeometry = -1;
      int lastMatrix   = -1;
      bool lastSolid   = true;

      ShadeCommand& sc = m_shades[shade];
      sc.indirects.clear();
      sc.assigns.clear();

      sc.offsets.clear();
      sc.sizes.clear();
      sc.geometries.clear();
      sc.solids.clear();

      std::vector<int>& assigns = sc.assigns;
      std::vector<IndexedCommand>& commands = sc.indirects;

      // First command of the segment currently being built.
      size_t segmentBegin = 0;

      int numAssigns = 0;

      for (size_t i = 0; i < drawItems.size(); ++i){
        const DrawItem& item = drawItems[i];

        if (shade == SHADE_SOLID && !item.solid){
          if (m_sort) break;
          continue;
        }

        const bool geometryChanged = lastGeometry != item.geometryIndex;
        const bool solidFlipped    = shade == SHADE_SOLIDWIRE && item.solid != lastSolid;
        if (geometryChanged || solidFlipped){
          closeSegment(sc, segmentBegin, lastSolid, lastGeometry);
          segmentBegin = commands.size();
        }

#if USE_VERTEX_ASSIGNS
        if (lastMatrix != item.matrixIndex || lastMaterial != item.materialIndex)
        {
          // push indices
          assigns.push_back(item.matrixIndex);
          assigns.push_back(item.materialIndex);
          numAssigns++;

          lastMatrix    = item.matrixIndex;
          lastMaterial  = item.materialIndex;
        }
#endif

        IndexedCommand command;
        command.cmd.count = item.range.count;
        // range.offset is in bytes; the command wants an index count (32-bit indices).
        command.cmd.firstIndex = GLuint((item.range.offset )/sizeof(GLuint));
#if USE_VERTEX_ASSIGNS
        command.cmd.baseInstance = numAssigns - 1;
#else
        command.cmd.baseInstance = packBaseInstance(item.matrixIndex, item.materialIndex);
#endif
        commands.push_back(command);

        lastSolid    = item.solid;
        lastGeometry = item.geometryIndex;
      }

      // Close whatever segment is still open.
      closeSegment(sc, segmentBegin, lastSolid, lastGeometry);
    }

  };

  // File-scope instances of the four RendererIndexedMDI type descriptors.
  // NOTE(review): going by the names these cover the plain, VBUM, sorted and
  // sorted + VBUM variants, and presumably register themselves with the renderer
  // registry on construction — confirm against the Type classes and renderer.hpp.
  static RendererIndexedMDI::Type s_indexed;
  static RendererIndexedMDI::TypeVbum s_indexed_vbum;
  static RendererIndexedMDI::TypeSort s_indexedsort;
  static RendererIndexedMDI::TypeSortVbum s_indexedsort_vbum;

  void RendererIndexedMDI::init( const CadScene* NV_RESTRICT scene, const Resources& resources )
  {
    m_scene = scene;
    resources.usingUboProgram(false);

    std::vector<DrawItem> drawItems;

    fillDrawItems(drawItems,0,scene->m_objects.size(), true, true);

    if (m_sort){
      std::sort(drawItems.begin(),drawItems.end(),DrawItem_compare_groups);
    }

    // build SC

    GenerateIndirects(drawItems, SHADE_SOLID, scene, resources);
    GenerateIndirects(drawItems, SHADE_SOLIDWIRE, scene, resources);

    for (size_t i = 0; i <= SHADE_SOLIDWIRE; i++){
      ShadeCommand& sc = m_shades[i];
#if USE_GPU_INDIRECT
      glCreateBuffers(1,&sc.indirectGL);
      glNamedBufferStorage( sc.indirectGL, sizeof(IndexedCommand) * sc.indirects.size(), &sc.indirects[0], 0 );
      if (m_vbum){
        glGetNamedBufferParameterui64vNV(sc.indirectGL, GL_BUFFER_GPU_ADDRESS_NV, &sc.indirectADDR);
        glMakeNamedBufferResidentNV(sc.indirectGL, GL_READ_ONLY);
      }
#endif
#if USE_VERTEX_ASSIGNS
      glCreateBuffers(1,&sc.assignGL);
      glNamedBufferStorage( sc.assignGL, sizeof(int) * sc.assigns.size(), &sc.assigns[0], 0 );
      if (m_vbum){
        glGetNamedBufferParameterui64vNV(sc.assignGL, GL_BUFFER_GPU_ADDRESS_NV, &sc.assignADDR);
        glMakeNamedBufferResidentNV(sc.assignGL, GL_READ_ONLY);
      }
#endif
    }

    m_shades[SHADE_SOLIDWIRE_SPLIT] = m_shades[SHADE_SOLIDWIRE];

  }

  // Releases the GL buffers created by init().
  //
  // init() creates buffers for every shade in [0, SHADE_SOLIDWIRE] (inclusive), so the
  // same inclusive range has to be released here; stopping one entry early would leak
  // the SHADE_SOLIDWIRE buffers and leave them resident. SHADE_SOLIDWIRE_SPLIT only
  // holds a copy of the SHADE_SOLIDWIRE handles and therefore must not be released
  // a second time.
  void RendererIndexedMDI::deinit()
  {
    for (size_t i = 0; i <= SHADE_SOLIDWIRE; i++){
      ShadeCommand& sc = m_shades[i];
      if (m_vbum){
        // buffers were made resident in init() when unified memory is in use
#if USE_GPU_INDIRECT
        glMakeNamedBufferNonResidentNV(sc.indirectGL);
#endif
#if USE_VERTEX_ASSIGNS
        glMakeNamedBufferNonResidentNV(sc.assignGL);
#endif
      }
#if USE_GPU_INDIRECT
      glDeleteBuffers(1,&sc.indirectGL);
#endif
#if USE_VERTEX_ASSIGNS
      glDeleteBuffers(1,&sc.assignGL);
#endif
    }
  }

  // Draws the scene for the requested shade mode with glMultiDrawElementsIndirect.
  //
  // The function sets up program, vertex format, uniform buffers and the matrix texture,
  // then walks the sequences prepared by GenerateIndirects(): geometry buffers are
  // rebound whenever the geometry index changes, the wire mode is switched whenever the
  // solid flag changes, and each sequence is issued as one multi-draw. All touched GL
  // state is reset before returning.
  //
  // profiler and progManager are not used by this function.
  void RendererIndexedMDI::draw( ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager )
  {
    const CadScene* NV_RESTRICT scene = m_scene;
    const bool useVbum      = m_vbum;
    const bool drawsOutline = (shadetype == SHADE_SOLIDWIRE || shadetype == SHADE_SOLIDWIRE_SPLIT);

    scene->enableVertexFormat(VERTEX_POS, VERTEX_NORMAL);

    glUseProgram(resources.programIdx);

    if (drawsOutline){
      glEnable(GL_POLYGON_OFFSET_FILL);
      glPolygonOffset(1, 1);
    }

    SetWireMode(GL_FALSE);

#if USE_VERTEX_ASSIGNS
    // per-instance attribute carrying the (matrix, material) pair
    glVertexAttribIFormat(VERTEX_ASSIGNS, 2, GL_INT, 0);
    glVertexAttribBinding(VERTEX_ASSIGNS, 1);
    glEnableVertexAttribArray(VERTEX_ASSIGNS);
    glBindVertexBuffer(1, 0, 0, sizeof(GLint)*2);
    glVertexBindingDivisor(1, 1);
#endif
    if (useVbum){
      glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
      glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
#if USE_GPU_INDIRECT
      glEnableClientState(GL_DRAW_INDIRECT_UNIFIED_NV);
#endif
    }
    if (useVbum && s_bindless_ubo){
      // uniform buffers are provided through GPU addresses
      glEnableClientState(GL_UNIFORM_BUFFER_UNIFIED_NV);
      glBufferAddressRangeNV(GL_UNIFORM_BUFFER_ADDRESS_NV, UBO_MATERIAL, scene->m_materialsADDR, sizeof(CadScene::Material) * scene->m_materials.size() );
      glBufferAddressRangeNV(GL_UNIFORM_BUFFER_ADDRESS_NV, UBO_SCENE, resources.sceneAddr, sizeof(SceneData));
    }
    else{
      glBindBufferBase(GL_UNIFORM_BUFFER, UBO_SCENE, resources.sceneUbo);
      glBindBufferBase(GL_UNIFORM_BUFFER, UBO_MATERIAL, scene->m_materialsGL);
    }

    nvgl::bindMultiTexture(GL_TEXTURE0 + TEX_MATRICES, GL_TEXTURE_BUFFER, scene->m_matricesTexGL);
    glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);

    {
      ShadeCommand& cmds = m_shades[shadetype];

      // make the command and assign streams of this shade mode available
      if (useVbum){
#if USE_GPU_INDIRECT
        glBufferAddressRangeNV(GL_DRAW_INDIRECT_ADDRESS_NV, 0, cmds.indirectADDR, cmds.indirects.size() * sizeof(IndexedCommand) );
#endif
#if USE_VERTEX_ASSIGNS
        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 1, cmds.assignADDR, cmds.assigns.size() * sizeof(GLint));
#endif
      }
      else{
#if USE_GPU_INDIRECT
        glBindBuffer(GL_DRAW_INDIRECT_BUFFER, cmds.indirectGL);
#endif
#if USE_VERTEX_ASSIGNS
        glBindVertexBuffer(1, cmds.assignGL, 0, sizeof(GLint)*2);
#endif
      }

      // base of the command stream: a client memory address for CPU indirect,
      // otherwise a zero offset into the bound / addressed buffer
#if USE_CPU_INDIRECT
      size_t streamBase = (size_t)&cmds.indirects[0];
#else
      size_t streamBase = 0;
#endif

      int  boundGeometry = -1;
      bool prevSolid     = true;
      for (size_t seq = 0; seq < cmds.geometries.size(); seq++){
        const int geometryIndex = cmds.geometries[seq];

        // rebind vertex / index buffers only when the geometry changes
        if (geometryIndex != boundGeometry){
          const CadScene::Geometry& geo = m_scene->m_geometry[ geometryIndex ];
          if (useVbum){
            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0, geo.vboADDR, geo.numVertices * sizeof(CadScene::Vertex));
            glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, geo.iboADDR, (geo.numIndexSolid+geo.numIndexWire) * sizeof(GLuint));
          }
          else{
            glBindVertexBuffer(0, geo.vboGL, 0, sizeof(CadScene::Vertex));
            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, geo.iboGL);
          }
          boundGeometry = geometryIndex;
        }

        const bool solid = cmds.solids[seq];
        if (solid != prevSolid){
          SetWireMode((!solid));
        }

        glMultiDrawElementsIndirect(
          solid ? GL_TRIANGLES : GL_LINES,
          GL_UNSIGNED_INT,
          (const void*)(streamBase + cmds.offsets[seq] * sizeof(IndexedCommand)),
          GLsizei(cmds.sizes[seq]),
          0);

        prevSolid = solid;
      }
    }

    // restore the GL state
#if USE_VERTEX_ASSIGNS
    glDisableVertexAttribArray(VERTEX_ASSIGNS);
    glBindVertexBuffer(1, 0, 0, 0);
    glVertexBindingDivisor(1, 0);
#endif

    glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
    nvgl::bindMultiTexture(GL_TEXTURE0 + TEX_MATRICES, GL_TEXTURE_BUFFER, 0);

    glBindBufferBase(GL_UNIFORM_BUFFER, UBO_SCENE, 0);
    glBindBufferBase(GL_UNIFORM_BUFFER, UBO_MATERIAL, 0);

    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
    glBindVertexBuffer(0, 0, 0, 0);

    if (useVbum){
      glDisableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
      glDisableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
#if USE_GPU_INDIRECT
      glDisableClientState(GL_DRAW_INDIRECT_UNIFIED_NV);
#endif
      if (s_bindless_ubo){
        glDisableClientState(GL_UNIFORM_BUFFER_UNIFIED_NV);
      }
    }

    if (drawsOutline){
      glDisable(GL_POLYGON_OFFSET_FILL);
      glPolygonOffset(0, 0);
    }

    SetWireMode(GL_FALSE);

    scene->disableVertexFormat(VERTEX_POS, VERTEX_NORMAL);

  }

}


================================================
FILE: renderertoken.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include "tokenbase.hpp"

#include "common.h"

namespace csfviewer
{
  //////////////////////////////////////////////////////////////////////////

  class RendererToken: public Renderer, public TokenRendererBase {
  public:
    // Type descriptor for the "tokenbuffer" variant: a RendererToken with default settings.
    class Type : public Renderer::Type
    {
      // Offered only when native command-list support is reported.
      bool isAvailable() const
      {
        return TokenRendererBase::hasNativeCommandList();
      }

      const char* name() const
      {
        return "tokenbuffer";
      }

      // Creates a renderer with all option flags left at their defaults.
      Renderer* create() const
      {
        return new RendererToken();
      }

      unsigned int priority() const
      {
        return 9;
      }
    };
    // Type descriptor for the "tokenbuffer_address" variant: sets m_useaddress on the renderer.
    class TypeAddr : public Renderer::Type
    {
      // Offered only when native command-list support is reported.
      bool isAvailable() const
      {
        return TokenRendererBase::hasNativeCommandList();
      }

      const char* name() const
      {
        return "tokenbuffer_address";
      }

      Renderer* create() const
      {
        RendererToken* instance = new RendererToken();
        instance->m_useaddress = true;
        return instance;
      }

      unsigned int priority() const
      {
        return 9;
      }
    };
    // Type descriptor for the "tokenlist" variant: sets m_uselist on the renderer.
    class TypeList : public Renderer::Type
    {
      // Offered only when native command-list support is reported.
      bool isAvailable() const
      {
        return TokenRendererBase::hasNativeCommandList();
      }

      const char* name() const
      {
        return "tokenlist";
      }

      Renderer* create() const
      {
        RendererToken* instance = new RendererToken();
        instance->m_uselist = true;
        return instance;
      }

      unsigned int priority() const
      {
        return 8;
      }
    };
    // Type descriptor for the "tokenbuffer_emulated" variant: sets m_emulate on the renderer.
    class TypeEmu : public Renderer::Type
    {
      // The emulated variant is always offered.
      bool isAvailable() const
      {
        return true;
      }

      const char* name() const
      {
        return "tokenbuffer_emulated";
      }

      Renderer* create() const
      {
        RendererToken* instance = new RendererToken();
        instance->m_emulate = true;
        return instance;
      }

      unsigned int priority() const
      {
        return 9;
      }
    };

    // Type descriptor for the "tokenbuffer_sorted" variant: sets m_sort on the renderer.
    class TypeSort : public Renderer::Type
    {
      // Offered only when native command-list support is reported.
      bool isAvailable() const
      {
        return TokenRendererBase::hasNativeCommandList();
      }

      const char* name() const
      {
        return "tokenbuffer_sorted";
      }

      Renderer* create() const
      {
        RendererToken* instance = new RendererToken();
        instance->m_sort = true;
        return instance;
      }

      unsigned int priority() const
      {
        return 9;
      }
    };
    // Type descriptor for the "tokenbuffer_sorted_address" variant:
    // sets m_useaddress and m_sort on the renderer.
    class TypeSortAddr : public Renderer::Type
    {
      // Offered only when native command-list support is reported.
      bool isAvailable() const
      {
        return TokenRendererBase::hasNativeCommandList();
      }

      const char* name() const
      {
        return "tokenbuffer_sorted_address";
      }

      Renderer* create() const
      {
        RendererToken* instance = new RendererToken();
        instance->m_useaddress = true;
        instance->m_sort       = true;
        return instance;
      }

      unsigned int priority() const
      {
        return 9;
      }
    };
    // Type descriptor for the "tokenlist_sorted" variant:
    // sets m_uselist and m_sort on the renderer.
    class TypeSortList : public Renderer::Type
    {
      // Offered only when native command-list support is reported.
      bool isAvailable() const
      {
        return TokenRendererBase::hasNativeCommandList();
      }

      const char* name() const
      {
        return "tokenlist_sorted";
      }

      Renderer* create() const
      {
        RendererToken* instance = new RendererToken();
        instance->m_uselist = true;
        instance->m_sort    = true;
        return instance;
      }

      unsigned int priority() const
      {
        return 8;
      }
    };
    // Type descriptor for the "tokenbuffer_sorted_emulated" variant:
    // sets m_emulate and m_sort on the renderer.
    class TypeSortEmu : public Renderer::Type
    {
      // The emulated variant is always offered.
      bool isAvailable() const
      {
        return true;
      }

      const char* name() const
      {
        return "tokenbuffer_sorted_emulated";
      }

      Renderer* create() const
      {
        RendererToken* instance = new RendererToken();
        instance->m_emulate = true;
        instance->m_sort    = true;
        return instance;
      }

      unsigned int priority() const
      {
        return 9;
      }
    };

  public:
    void init(const CadScene* NV_RESTRICT scene, const Resources& resources);
    void deinit();
    void draw(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager);

  private:

    std::vector<DrawItem>       m_drawItems;

    void GenerateTokens(std::vector<DrawItem>& drawItems, ShadeType shade, const CadScene* NV_RESTRICT scene, const Resources& resources )
    {
      int lastMaterial = -1;
      int lastGeometry = -1;
      int lastMatrix   = -1;
      bool lastSolid   = true;

      ShadeCommand& sc = m_shades[shade];
      sc.fbos.clear();
      sc.offsets.clear();
      sc.sizes.clear();
      sc.states.clear();
      
      std::string& tokenStream = m_tokenStreams[shade];
      tokenStream.clear();

      size_t begin = 0;

      {
        NVTokenUbo ubo;
        ubo.cmd.index   = UBO_SCENE;
        ubo.cmd.stage   = UBOSTAGE_VERTEX;
        ubo.setBuffer(resources.sceneUbo, resources.sceneAddr, 0, sizeof(SceneData));
        nvtokenEnqueue(tokenStream, ubo);

        ubo.cmd.stage   = UBOSTAGE_FRAGMENT;
        nvtokenEnqueue(tokenStream, ubo);

#if USE_POLYOFFSETTOKEN
        NVTokenPolygonOffset offset;
        offset.cmd.bias = 1;
        offset.cmd.scale = 1;
        nvtokenEnqueue(tokenStream, offset);
#endif
      }

      for (int i = 0; i < drawItems.size(); i++){
        const DrawItem& di = drawItems[i];

        if (shade == SHADE_SOLID && !di.solid){
          continue;
        }

        if (shade == SHADE_SOLIDWIRE && di.solid != lastSolid){
          sc.offsets.push_back( begin );
          sc.sizes.  push_back( GLsizei((tokenStream.size()-begin)) );
          sc.states. push_back( m_stateObjects[ lastSolid ? STATE_TRISOFFSET : STATE_LINES ] );
          sc.fbos.   push_back( 0 );

          begin = tokenStream.size();
        }

        if (lastGeometry != di.geometryIndex){
          const CadScene::Geometry &geo = scene->m_geometry[di.geometryIndex];
          NVTokenVbo vbo;
          vbo.cmd.index = 0;
          vbo.setBuffer(geo.vboGL, geo.vboADDR, 0);
          nvtokenEnqueue(tokenStream, vbo);

          NVTokenIbo ibo;
          ibo.setBuffer(geo.iboGL, geo.iboADDR);
          ibo.cmd.typeSizeInByte = 4;
          nvtokenEnqueue(tokenStream, ibo);

          lastGeometry = di.geometryIndex;
        }

        if (lastMatrix != di.matrixIndex){

          NVTokenUbo ubo;
          ubo.cmd.index   = UBO_MATRIX;
          ubo.cmd.stage   = UBOSTAGE_VERTEX;
          ubo.setBuffer(scene->m_matricesGL, scene->m_matricesADDR, sizeof(CadScene::MatrixNode) * di.matrixIndex, sizeof(CadScene::MatrixNode));
          nvtokenEnqueue(tokenStream, ubo);

          lastMatrix = di.matrixIndex;
        }

        if (lastMaterial != di.materialIndex){

          NVTokenUbo ubo;
          ubo.cmd.index   = UBO_MATERIAL;
          ubo.cmd.stage   = UBOSTAGE_FRAGMENT;
          ubo.setBuffer(scene->m_materialsGL, scene->m_materialsADDR, sizeof(CadScene::Material) * di.materialIndex, sizeof(CadScene::Material));
          nvtokenEnqueue(tokenStream, ubo);

          lastMaterial = di.materialIndex;
        }


        NVTokenDrawElemsUsed drawelems;
        drawelems.setMode(di.solid ? GL_TRIANGLES : GL_LINES);
        drawelems.cmd.count = di.range.count;
        drawelems.cmd.firstIndex = GLuint((di.range.offset )/sizeof(GLuint));
        nvtokenEnqueue(tokenStream, drawelems);

        lastSolid = di.solid;
      }

      sc.offsets.push_back( begin );
      sc.sizes.  push_back( GLsizei((tokenStream.size()-begin)) );
      if (shade == SHADE_SOLID){
        sc.states. push_back( m_stateObjects[ STATE_TRIS ] );
      }
      else{
        sc.states. push_back( m_stateObjects[ lastSolid ? STATE_TRISOFFSET : STATE_LINES ] );
      }
      sc.fbos. push_back( 0 );

    }

  };
  // File-scope instances of the RendererToken type descriptors: the four unsorted variants ...
  // NOTE(review): presumably each instance registers itself with the renderer registry
  // on construction — confirm in renderer.hpp.
  static RendererToken::Type      s_token;
  static RendererToken::TypeAddr  s_token_addr;
  static RendererToken::TypeList  s_token_list;
  static RendererToken::TypeEmu   s_token_emu;

  // ... and their sorted counterparts (these set m_sort in create()).
  static RendererToken::TypeSort      s_sorttoken;
  static RendererToken::TypeSortAddr  s_sorttoken_addr;
  static RendererToken::TypeSortList  s_sorttoken_list;
  static RendererToken::TypeSortEmu   s_sorttoken_emu;

  void RendererToken::init(const CadScene* NV_RESTRICT scene, const Resources& resources)
  {
    TokenRendererBase::init(s_bindless_ubo, !!has_GL_NV_vertex_buffer_unified_memory);
    resources.usingUboProgram(true);

    m_scene = scene;

    std::vector<DrawItem> drawItems;

    fillDrawItems(drawItems,0,scene->m_objects.size(), true, true);

    if (USE_PERFRAMEBUILD){
      m_drawItems = drawItems;
    }

    if (m_sort){
      std::sort(drawItems.begin(),drawItems.end(),DrawItem_compare_groups);
    }

    GenerateTokens(drawItems, SHADE_SOLID, scene, resources);

    TokenRendererBase::printStats(SHADE_SOLID);

    GenerateTokens(drawItems, SHADE_SOLIDWIRE, scene, resources);

    TokenRendererBase::printStats(SHADE_SOLIDWIRE);

    TokenRendererBase::finalize(resources);
  }

  // Tears the renderer down: the token base class cleans up first,
  // then the draw items kept for per-frame rebuilds are dropped.
  void RendererToken::deinit()
  {
    TokenRendererBase::deinit();

    m_drawItems.clear();
  }

  // Draws the scene for the requested shade mode from the prepared token streams.
  //
  // After the GL state setup (needed for state capturing) the tokens are optionally
  // rebuilt for this frame (USE_PERFRAMEBUILD), the state objects are captured, and the
  // stream is submitted through a command list, through GPU addresses, through the token
  // buffer, or through the software path when m_hwsupport is not set. All touched GL
  // state is reset before returning.
  //
  // progManager is not used by this function.
  void RendererToken::draw(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager)
  {
    const CadScene* NV_RESTRICT scene = m_scene;

    // do state setup (primarily for sake of state capturing)
    scene->enableVertexFormat(VERTEX_POS, VERTEX_NORMAL);

    if (!m_bindlessVboUbo){
      glBindBufferBase(GL_UNIFORM_BUFFER, UBO_SCENE, resources.sceneUbo);
    }
    else{
      glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
      glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
      glEnableClientState(GL_UNIFORM_BUFFER_UNIFIED_NV);
    }

    if (USE_PERFRAMEBUILD){

#if 0
      // alternative: work on a per-frame copy of the draw items
      std::vector<DrawItem> frameItems;
      {
        nvh::Profiler::Section _tempTimer(profiler ,"Copy");
        frameItems = m_drawItems;
      }
#else
      std::vector<DrawItem>& frameItems = m_drawItems;
#endif
      {
        nvh::Profiler::Section _tempTimer(profiler ,"Sort");
        std::sort(frameItems.begin(), frameItems.end(), DrawItem_compare_groups);
      }

      {
        nvh::Profiler::Section _tempTimer(profiler ,"Token");
        GenerateTokens(frameItems, shadetype, scene, resources);
      }

      // the token buffer is only used by the native, non-list paths
      if (!(m_emulate || m_uselist)){
        nvh::Profiler::Section _tempTimer(profiler ,"Build");
        ShadeCommand& rebuilt = m_shades[shadetype];
        std::string&  rebuiltStream = m_tokenStreams[shadetype];
        glInvalidateBufferData(m_tokenBuffers[shadetype]);
        glNamedBufferSubData(m_tokenBuffers[shadetype], rebuilt.offsets[0], rebuiltStream.size(), &rebuiltStream[0]);
      }
    }

    if (USE_STATEOBJ_REBUILD){
      nvh::Profiler::Section section(profiler,"state");
      for (int pass = 0; pass < 25; pass++){
        // make the stored change ids differ from the current ones before each capture
        m_stateChangeID    = resources.stateChangeID + 1;
        m_fboStateChangeID = resources.fboTextureChangeID + 1;
        captureState(resources);
      }
    }
    else{
      captureState(resources);
    }

    const bool drawsOutline = (shadetype == SHADE_SOLIDWIRE || shadetype == SHADE_SOLIDWIRE_SPLIT);
    if (!USE_POLYOFFSETTOKEN && drawsOutline){
      glPolygonOffset(1, 1);
    }

    if (!m_hwsupport){
      // software interpretation of the token stream
      ShadeCommand& shade  = m_shades[shadetype];
      std::string&  stream = m_tokenStreams[shadetype];
      renderShadeCommandSW(&stream[0], stream.size(), shade);
    }
    else if (m_uselist){
      glCallCommandListNV(m_commandLists[shadetype]);
    }
    else{
      ShadeCommand& shade = m_shades[shadetype];
      const int numSequences = int(shade.states.size());
      if (m_useaddress){
        glDrawCommandsStatesAddressNV(&shade.addresses[0], &shade.sizes[0], &shade.states[0], &shade.fbos[0], numSequences );
      }
      else{
        glDrawCommandsStatesNV(m_tokenBuffers[shadetype], &shade.offsets[0], &shade.sizes[0], &shade.states[0], &shade.fbos[0], numSequences );
      }
    }

    // restore the GL state
    glBindBufferBase(GL_UNIFORM_BUFFER, UBO_SCENE, 0);
    glBindBufferBase(GL_UNIFORM_BUFFER, UBO_MATRIX, 0);
    glBindBufferBase(GL_UNIFORM_BUFFER, UBO_MATERIAL, 0);

    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
    glBindVertexBuffer(0, 0, 0, 0);

    glDisable(GL_POLYGON_OFFSET_FILL);
    glPolygonOffset(0, 0);

    if (m_bindlessVboUbo){
      glDisableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
      glDisableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
      glDisableClientState(GL_UNIFORM_BUFFER_UNIFIED_NV);
    }

    scene->disableVertexFormat(VERTEX_POS, VERTEX_NORMAL);
  }

}


================================================
FILE: renderertokensortcull.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include "tokenbase.hpp"
#include "cullingsystem.hpp"

#include "common.h"

namespace csfviewer
{
  //////////////////////////////////////////////////////////////////////////

// Compile-time switches of the cull-sorted token renderer.
// NOTE(review): USE_TEMPORALRASTER is not referenced in the part of the file reviewed
// here; presumably it selects the culling method used when drawing — confirm.
#define USE_TEMPORALRASTER      1
// When non-zero, draw items are additionally ordered by object index and the token
// generation resets its cached geometry/matrix/material whenever the object changes,
// tagging each emitted token with the object it belongs to.
#define USE_OBJECTSORT_CULLING  1


  class RendererCullSortToken : public Renderer, public TokenRendererBase {
  public:
    // Shader programs shared by all RendererCullSortToken type descriptors,
    // accessible through a function-local static instance.
    class Shared {
    public:
      nvgl::ProgramID
        token_sizes,
        token_scan,
        token_cmds;

      // Returns the single shared instance.
      static Shared& get()
      {
        static Shared instance;
        return instance;
      }

      Shared() : loaded(false) {}

      // Creates the token_sizes and token_cmds programs on the first call and reports
      // whether the program manager considers its programs valid. Any later call
      // returns true without doing further work.
      bool load(nvgl::ProgramManager &progManager)
      {
        if (!loaded){
          loaded = true;

          token_sizes = progManager.createProgram(
            nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "cull-tokensizes.vert.glsl"));
          token_cmds = progManager.createProgram(
            nvgl::ProgramManager::Definition(GL_VERTEX_SHADER, "cull-tokencmds.vert.glsl"));

          return progManager.areProgramsValid() ? true : false;
        }

        return true;
      }

    private:
      // set once load() has been entered for the first time
      bool loaded;
    };

    // Type descriptor for the "tokenbuffer_cullsorted" variant:
    // a RendererCullSortToken with default settings.
    class Type : public Renderer::Type
    {
      // Offered only when native command-list support is reported.
      bool isAvailable() const
      {
        return TokenRendererBase::hasNativeCommandList();
      }

      const char* name() const
      {
        return "tokenbuffer_cullsorted";
      }

      Renderer* create() const
      {
        return new RendererCullSortToken();
      }

      // Loads the shader programs shared by all variants of this renderer.
      bool loadPrograms( nvgl::ProgramManager &mgr)
      {
        Shared& shared = Shared::get();
        return shared.load(mgr);
      }

      unsigned int priority() const
      {
        return 9;
      }
    };
    // Type descriptor for the "tokenbuffer_cullsorted_emulated" variant:
    // sets m_emulate on the renderer.
    class TypeEmu : public Renderer::Type
    {
      // Offered when GL_NV_vertex_buffer_unified_memory is available.
      bool isAvailable() const
      {
        return !!has_GL_NV_vertex_buffer_unified_memory;
      }

      const char* name() const
      {
        return "tokenbuffer_cullsorted_emulated";
      }

      Renderer* create() const
      {
        RendererCullSortToken* instance = new RendererCullSortToken();
        instance->m_emulate = true;
        return instance;
      }

      // Loads the shader programs shared by all variants of this renderer.
      bool loadPrograms( nvgl::ProgramManager &mgr )
      {
        Shared& shared = Shared::get();
        return shared.load(mgr);
      }

      unsigned int priority() const
      {
        return 9;
      }
    };

  public:
    void init(const CadScene* NV_RESTRICT scene, const Resources& resources);
    void deinit();
    void draw(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager);
    void drawScene(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager, const char*what);

  private:

    // Strict ordering for draw items: solid items come before non-solid ones, then
    // (with USE_OBJECTSORT_CULLING) ascending object index, followed by ascending
    // material, geometry and matrix index. Returns true when a sorts before b.
    static bool DrawItem_compare_groups(const DrawItem& a, const DrawItem& b)
    {
      // solid before non-solid
      if (!(a.solid == b.solid)){
        return a.solid ? true : false;
      }

      int delta = 0;
#if USE_OBJECTSORT_CULLING
      delta = a.objectIndex - b.objectIndex;
      if (delta != 0){
        return delta < 0;
      }
#endif
      delta = a.materialIndex - b.materialIndex;
      if (delta != 0){
        return delta < 0;
      }

      delta = a.geometryIndex - b.geometryIndex;
      if (delta != 0){
        return delta < 0;
      }

      delta = a.matrixIndex - b.matrixIndex;
      return delta < 0;
    }

    // Describes one sequence of tokens within a shade's token stream, as recorded
    // by GenerateTokens whenever a ShadeCommand sequence is closed.
    struct CullSequence {
      GLuint    offset;     // start of the sequence in the token stream, in GLuint units
      GLint     endoffset;  // end of the sequence in the token stream, in GLuint units
      int       first;      // index of the first token that belongs to the sequence
      int       num;        // number of tokens in the sequence
    };

    // Per-shade culling data that accompanies the token stream of that shade.
    struct CullShade {
      GLuint                    numTokens;  // number of tokens enqueued by GenerateTokens
      std::vector<CullSequence> sequnces;   // token sequences of this shade (identifier spelling is as declared)

      // static buffers
      // NOTE(review): presumably holds the unculled token stream — confirm where it is filled
      ScanSystem::Buffer   tokenOrig;

      // for each command, #cmds rounded to multiple of 4
      ScanSystem::Buffer   tokenSizes;   // in integers
      ScanSystem::Buffer   tokenObjects; // -1 if no drawcall, otherwise object
      ScanSystem::Buffer   tokenOffsets; // offsets for each command

      // NOTE(review): names suggest output sizes and their scan results used to
      // compact the surviving tokens — confirm against the code that uses them
      ScanSystem::Buffer   tokenOutSizes;
      ScanSystem::Buffer   tokenOutScan;
      ScanSystem::Buffer   tokenOutScanOffset;
    };

    // Culling job specialization for the token renderer; resultFromBits is defined
    // outside this class body.
    class CullJobToken : public CullingSystem::Job
    {
    public:
      // Receives the buffer with the current visibility bits.
      // NOTE(review): presumably turns them into the culled token stream — confirm in the definition.
      void resultFromBits( const CullingSystem::Buffer& bufferVisBitsCurrent );

      // GL program handles; NOTE(review): presumably the Shared token_sizes / token_cmds programs — confirm
      GLuint      program_sizes;
      GLuint      program_cmds;

      // dynamic
      ScanSystem::Buffer   tokenOut;

      // culling data of the shade the job operates on
      CullShade* NV_RESTRICT cullshade;
    };

    std::vector<DrawItem>       m_drawItems;

    CullJobToken                m_culljob;
    CullShade                   m_cullshades[NUM_SHADES];
    GLuint                      m_maxGrps;

    void PrepareCullJob(ShadeType shade);


    // Records the bookkeeping entry of a token that has just been appended to a stream.
    //
    // tokenSizes   : receives the token size in GLuint units
    // tokenOffsets : receives the token start in GLuint units, derived from the stream
    //                position after the token (stream) minus the token size
    // tokenObjects : receives obj (-1 by default)
    // token        : only its type T is used, to obtain the token size
    template <class T>
    static void handleToken(std::vector<GLuint> &tokenSizes, std::vector<GLuint> &tokenOffsets,std::vector<GLint>&  tokenObjects, T &token, size_t stream, int obj=-1)
    {
      const size_t tokenBytes = sizeof(T);
      const size_t wordBytes  = sizeof(GLuint);

      tokenSizes.push_back( GLuint(tokenBytes / wordBytes) );
      tokenOffsets.push_back( GLuint((stream - tokenBytes) / wordBytes) );
      tokenObjects.push_back( obj );
    }

    void GenerateTokens(std::vector<DrawItem>& drawItems, ShadeType shade, const CadScene* NV_RESTRICT scene, const Resources& resources )
    {
      int lastMaterial = -1;
      int lastGeometry = -1;
      int lastMatrix   = -1;
      int lastObject   = -1;
      bool lastSolid   = true;

      ShadeCommand& sc = m_shades[shade];
      CullShade& cull = m_cullshades[shade];

      sc.fbos.clear();
      sc.offsets.clear();
      sc.sizes.clear();
      sc.states.clear();

      std::string& tokenStream = m_tokenStreams[shade];
      tokenStream.clear();


      cull.numTokens = 0;
      GLuint beginToken = 0;

      size_t begin = 0;
      size_t start = begin;

      std::vector<GLuint> tokenSizes;
      std::vector<GLuint> tokenOffsets;
      std::vector<GLint>  tokenObjects;

      {
        NVTokenUbo ubo;
        ubo.cmd.index   = UBO_SCENE;
        ubo.cmd.stage   = UBOSTAGE_VERTEX;
        ubo.setBuffer(resources.sceneUbo, resources.sceneAddr, 0, sizeof(SceneData) );
        nvtokenEnqueue(tokenStream, ubo);
        handleToken(tokenSizes,tokenOffsets,tokenObjects, ubo, tokenStream.size()-start, -1);
        cull.numTokens++;

        ubo.cmd.stage   = UBOSTAGE_FRAGMENT;
        nvtokenEnqueue(tokenStream, ubo);
        handleToken(tokenSizes,tokenOffsets,tokenObjects, ubo, tokenStream.size()-start, -1);
        cull.numTokens++;

#if USE_POLYOFFSETTOKEN
        NVTokenPolygonOffset offset;
        offset.cmd.bias = 1;
        offset.cmd.scale = 1;
        nvtokenEnqueue(tokenStream, offset);
        handleToken(tokenSizes,tokenOffsets,tokenObjects, offset, tokenStream.size()-start, -1);
        cull.numTokens++;
#endif
      }

      for (int i = 0; i < drawItems.size(); i++){
        const DrawItem& di = drawItems[i];

        if (shade == SHADE_SOLID && !di.solid){
          continue;
        }

        int bufferObjIndex = -1;
#if USE_OBJECTSORT_CULLING
        bufferObjIndex = di.objectIndex;
        if (di.objectIndex != lastObject || di.solid != lastSolid){
          // whenever an object changes or we switches from solid to edges (happens only once in this sorted scenario)
          // we have to ensure all buffers are reset as well
          lastObject = di.objectIndex;
          lastMaterial = -1;
          lastGeometry = -1;
          lastMatrix   = -1;
        }
#endif

        if (shade == SHADE_SOLIDWIRE && di.solid != lastSolid){
          sc.offsets.push_back( begin );
          sc.sizes.  push_back( GLsizei((tokenStream.size()-begin)) );
          sc.states. push_back( m_stateObjects[ lastSolid ? STATE_TRISOFFSET : STATE_LINES ] );
          sc.fbos.   push_back( 0 );
          CullSequence cullseq;
          cullseq.num     = cull.numTokens - beginToken;
          cullseq.first   = beginToken;
          cullseq.offset  = GLuint((begin-start)/sizeof(GLuint));
          cullseq.endoffset = GLuint((tokenStream.size()-start)/sizeof(GLuint));
          cull.sequnces.push_back(cullseq);

          beginToken = cull.numTokens;
          begin = tokenStream.size();
        }

        if (lastGeometry != di.geometryIndex){
          const CadScene::Geometry &geo = scene->m_geometry[di.geometryIndex];
          NVTokenVbo vbo;
          vbo.cmd.index = 0;
          vbo.setBuffer(geo.vboGL, geo.vboADDR, 0);

          nvtokenEnqueue(tokenStream, vbo);
          handleToken(tokenSizes,tokenOffsets,tokenObjects, vbo, tokenStream.size()-start, bufferObjIndex);
          cull.numTokens++;

          NVTokenIbo ibo;
          ibo.setBuffer(geo.iboGL, geo.iboADDR);
          ibo.cmd.typeSizeInByte = 4;
          nvtokenEnqueue(tokenStream, ibo);
          handleToken(tokenSizes,tokenOffsets,tokenObjects, vbo, tokenStream.size()-start, bufferObjIndex);
          cull.numTokens++;

          lastGeometry = di.geometryIndex;
        }

        if (lastMatrix != di.matrixIndex){

          NVTokenUbo ubo;
          ubo.cmd.index   = UBO_MATRIX;
          ubo.cmd.stage   = UBOSTAGE_VERTEX;
          ubo.setBuffer(scene->m_matricesGL, scene->m_matricesADDR, sizeof(CadScene::MatrixNode) * di.matrixIndex, sizeof(CadScene::MatrixNode) );
          nvtokenEnqueue(tokenStream, ubo);
          handleToken(tokenSizes,tokenOffsets,tokenObjects, ubo, tokenStream.size()-start, bufferObjIndex);
          cull.numTokens++;

          lastMatrix = di.matrixIndex;
        }

        if (lastMaterial != di.materialIndex){

          NVTokenUbo ubo;
          ubo.cmd.index   = UBO_MATERIAL;
          ubo.cmd.stage   = UBOSTAGE_FRAGMENT;
          ubo.setBuffer(scene->m_materialsGL, scene->m_materialsADDR, sizeof(CadScene::Material) * di.materialIndex, sizeof(CadScene::Material) );
          nvtokenEnqueue(tokenStream, ubo);
          handleToken(tokenSizes,tokenOffsets,tokenObjects, ubo, tokenStream.size()-start, bufferObjIndex);
          cull.numTokens++;

          lastMaterial = di.materialIndex;
        }


        NVTokenDrawElemsUsed drawelems;
        drawelems.setMode(di.solid ? GL_TRIANGLES : GL_LINES);
        drawelems.cmd.count = di.range.count;
        drawelems.cmd.firstIndex = GLuint((di.range.offset )/sizeof(GLuint));
        nvtokenEnqueue(tokenStream, drawelems);
        handleToken(tokenSizes,tokenOffsets,tokenObjects, drawelems, tokenStream.size()-start, di.objectIndex);
        cull.numTokens++;

        lastSolid = di.solid;
      }

      sc.offsets.push_back( begin );
      sc.sizes.  push_back( GLsizei((tokenStream.size()-begin)) );
      if (shade == SHADE_SOLID){
        sc.states. push_back( m_stateObjects[ STATE_TRIS ] );
      }
      else{
        sc.states. push_back( m_stateObjects[ lastSolid ? STATE_TRISOFFSET : STATE_LINES ] );
      }
      sc.fbos. push_back( 0 );

      CullSequence cullseq;
      cullseq.num     = cull.numTokens - beginToken;
      cullseq.first   = beginToken;
      cullseq.offset  = GLuint((begin-start)/sizeof(GLuint));
      cullseq.endoffset = GLuint((tokenStream.size()-start)/sizeof(GLuint));
      cull.sequnces.push_back(cullseq);

      // create buffers for culling
      cull.tokenOrig.create(tokenStream.size() - start,&tokenStream[start], 0);

      cull.tokenOffsets.create(sizeof(GLuint)*cull.numTokens,&tokenOffsets[0], 0);
      cull.tokenSizes.  create(sizeof(GLuint)*cull.numTokens,&tokenSizes[0], 0);
      cull.tokenObjects.create(sizeof(GLint)*cull.numTokens,&tokenObjects[0], 0);

      int round4 = ((cull.numTokens+3)/4)*4;

      cull.tokenOutScan.      create(sizeof(GLuint)*round4,NULL, 0);
      cull.tokenOutScanOffset.create(std::max(ScanSystem::getOffsetSize(round4), size_t(16)),NULL, 0);
      cull.tokenOutSizes.     create(sizeof(GLuint)*round4,NULL, 0);
    }

  };


  // not yet fully implemented
  // File-local instances of the renderer's two type descriptors (Type and TypeEmu).
  // NOTE(review): presumably constructing a Renderer::Type registers it with the
  // renderer registry - confirm in renderer.hpp.
  static RendererCullSortToken::Type s_cullsorttoken;
  static RendererCullSortToken::TypeEmu s_cullsorttoken_emu;


  void RendererCullSortToken::init(const CadScene* NV_RESTRICT scene, const Resources& resources)
  {
    TokenRendererBase::init(s_bindless_ubo, !!has_GL_NV_vertex_buffer_unified_memory);
    resources.usingUboProgram(true);

    m_scene = scene;
    glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT,0,(GLint*)&m_maxGrps);

    std::vector<DrawItem> drawItems;

    fillDrawItems(drawItems,0,scene->m_objects.size(), true, true);

    std::sort(drawItems.begin(),drawItems.end(),DrawItem_compare_groups);

    GenerateTokens(drawItems, SHADE_SOLID, scene, resources);

    TokenRendererBase::printStats(SHADE_SOLID);

    GenerateTokens(drawItems, SHADE_SOLIDWIRE, scene, resources);

    TokenRendererBase::printStats(SHADE_SOLIDWIRE);

    TokenRendererBase::finalize(resources);

    if (m_emulate){
      for (int i = 0; i < NUM_SHADES; i++){
        glNamedBufferStorage(m_tokenBuffers[i], m_tokenStreams[i].size(), &m_tokenStreams[i][0], GL_MAP_READ_BIT);
      }
    }

    m_culljob.m_numObjects = int(m_scene->m_objects.size());

    int roundedBits = (m_culljob.m_numObjects+31)/32;
    int roundedInts = roundedBits*32;

    m_culljob.m_bufferBboxes    = CullingSystem::Buffer(m_scene->m_geometryBboxesGL, sizeof(CadScene::BBox) * m_scene->m_geometryBboxes.size());
    m_culljob.m_bufferMatrices  = CullingSystem::Buffer(m_scene->m_matricesGL, sizeof(CadScene::MatrixNode) * m_scene->m_matrices.size());
    m_culljob.m_bufferObjectMatrix  = CullingSystem::Buffer(m_scene->m_objectAssignsGL, sizeof(GLint)*2* m_scene->m_objectAssigns.size());
    m_culljob.m_bufferObjectMatrix.stride = sizeof(GLint)*2;
    m_culljob.m_bufferObjectBbox    = m_culljob.m_bufferObjectMatrix;
    m_culljob.m_bufferObjectBbox.offset = sizeof(GLint);
    m_culljob.m_bufferObjectBbox.size  -= sizeof(GLint);
    m_culljob.m_bufferObjectBbox.stride = sizeof(GLint)*2;

    m_culljob.m_bufferVisBitsCurrent.create(sizeof(int)*roundedBits,NULL,0);
    GLuint full = ~0;
    glClearNamedBufferData(m_culljob.m_bufferVisBitsCurrent.buffer,GL_R32UI,GL_RED_INTEGER,GL_UNSIGNED_INT,&full);
    m_culljob.m_bufferVisBitsLast.create(sizeof(int)*roundedBits,NULL,0);
    glClearNamedBufferData(m_culljob.m_bufferVisBitsLast.buffer,GL_R32UI,GL_RED_INTEGER,GL_UNSIGNED_INT,0);

    m_culljob.m_bufferVisOutput.create(sizeof(int)*roundedInts,NULL,0);
    m_cullshades[SHADE_SOLIDWIRE_SPLIT] = m_cullshades[SHADE_SOLIDWIRE];
  }

  void RendererCullSortToken::deinit()
  {
    for (int i = 0; i < 2; i++){
      CullShade &cs = m_cullshades[i];
      glDeleteBuffers(1,&cs.tokenOrig.buffer);
      glDeleteBuffers(1,&cs.tokenOffsets.buffer);
      glDeleteBuffers(1,&cs.tokenSizes.buffer);
      glDeleteBuffers(1,&cs.tokenObjects.buffer);

      glDeleteBuffers(1,&cs.tokenOutScan.buffer);
      glDeleteBuffers(1,&cs.tokenOutScanOffset.buffer);
      glDeleteBuffers(1,&cs.tokenOutSizes.buffer);
    }

    glDeleteBuffers(1,&m_culljob.m_bufferVisBitsCurrent.buffer);
    glDeleteBuffers(1,&m_culljob.m_bufferVisBitsLast.buffer);
    glDeleteBuffers(1,&m_culljob.m_bufferVisOutput.buffer);


    TokenRendererBase::deinit();
    m_drawItems.clear();
  }

  // Points the shared cull job at the culling data and output token buffer of one shade mode.
  //
  // shade - shade mode whose token buffer receives the culled tokens
  void RendererCullSortToken::PrepareCullJob(ShadeType shade)
  {
    CullShade&          activeCull = m_cullshades[shade];
    const ShadeCommand& commands   = m_shades[shade];

    m_culljob.cullshade = &activeCull;

    // output range: this shade's token buffer, starting at the offset of its first sequence
    // and spanning the size of the original (unculled) token data
    m_culljob.tokenOut.buffer = m_tokenBuffers[shade];
    m_culljob.tokenOut.offset = commands.offsets[0];
    m_culljob.tokenOut.size   = activeCull.tokenOrig.size;
  }

  // Produces the output token buffer from the given visibility bits in three steps:
  //  1. a rasterizer-discarded point draw with program_sizes, one vertex per token, fed with
  //     the token sizes and token object indices; tokenOutSizes and the visibility bits are
  //     bound as shader storage
  //  2. Renderer::s_scansys.scanData over the token count rounded up to a multiple of four
  //  3. one rasterizer-discarded point draw per cull sequence with program_cmds, with the
  //     output token buffer and the original tokens bound as shader storage
  // All vertex attribute arrays and shader storage bindings used here are reset afterwards.
  //
  // bufferVisBitsCurrent - visibility bit buffer, bound to shader storage slot 1 in step 1
  void RendererCullSortToken::CullJobToken::resultFromBits( const CullingSystem::Buffer& bufferVisBitsCurrent )
  {
    // first compute sizes based on culling result
    glUseProgram(program_sizes);

    glBindBuffer(GL_ARRAY_BUFFER, cullshade->tokenSizes.buffer);
    glVertexAttribIPointer(0,1,GL_UNSIGNED_INT,0,(const void*)cullshade->tokenSizes.offset);
    glBindBuffer(GL_ARRAY_BUFFER, cullshade->tokenObjects.buffer);
    glVertexAttribIPointer(1,1,GL_INT,0,(const void*)cullshade->tokenObjects.offset);

    glEnableVertexAttribArray(0);
    glEnableVertexAttribArray(1);

    cullshade->tokenOutSizes.BindBufferRange(GL_SHADER_STORAGE_BUFFER,0);
    bufferVisBitsCurrent.BindBufferRange(GL_SHADER_STORAGE_BUFFER,1);

    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT | GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT);

    GLuint numTokens = cullshade->numTokens;

    // rasterization stays disabled for all draws in this function
    glEnable(GL_RASTERIZER_DISCARD);
    glDrawArrays(GL_POINTS,0, numTokens);

    glDisableVertexAttribArray(0);
    glDisableVertexAttribArray(1);

    // scan over the per-token output sizes (count padded to a multiple of four)
    Renderer::s_scansys.scanData(((numTokens+3)/4)*4,cullshade->tokenOutSizes,cullshade->tokenOutScan,cullshade->tokenOutScanOffset);

    glUseProgram(program_cmds);
    glUniform1ui(glGetUniformLocation(program_cmds,"terminateCmd"),s_nvcmdlist_header[GL_TERMINATE_SEQUENCE_COMMAND_NV]);

    glBindBuffer(GL_ARRAY_BUFFER, cullshade->tokenOffsets.buffer);
    glVertexAttribIPointer(0,1,GL_UNSIGNED_INT,0,(const void*)cullshade->tokenOffsets.offset);
    glBindBuffer(GL_ARRAY_BUFFER, cullshade->tokenOutSizes.buffer);
    glVertexAttribIPointer(1,1,GL_UNSIGNED_INT,0,(const void*)cullshade->tokenOutSizes.offset);
    glBindBuffer(GL_ARRAY_BUFFER, cullshade->tokenOutScan.buffer);
    glVertexAttribIPointer(2,1,GL_UNSIGNED_INT,0,(const void*)cullshade->tokenOutScan.offset);

    glEnableVertexAttribArray(0);
    glEnableVertexAttribArray(1);
    glEnableVertexAttribArray(2);

    tokenOut.BindBufferRange(GL_SHADER_STORAGE_BUFFER,0);
    cullshade->tokenOrig.BindBufferRange(GL_SHADER_STORAGE_BUFFER,1);
    cullshade->tokenOutSizes.BindBufferRange(GL_SHADER_STORAGE_BUFFER,2);
    cullshade->tokenOutScan.BindBufferRange(GL_SHADER_STORAGE_BUFFER,3);
    cullshade->tokenOutScanOffset.BindBufferRange(GL_SHADER_STORAGE_BUFFER,4);

    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT | GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT);

    // the uniform locations are the same for every sequence, so query them only once
    const GLint locStartOffset = glGetUniformLocation(program_cmds,"startOffset");
    const GLint locStartID     = glGetUniformLocation(program_cmds,"startID");
    const GLint locEndOffset   = glGetUniformLocation(program_cmds,"endOffset");
    const GLint locEndID       = glGetUniformLocation(program_cmds,"endID");

    // each sequence is processed by its own draw over its token range
    for (GLuint i = 0; i < cullshade->sequnces.size() ; i++){
      glUniform1ui(locStartOffset,cullshade->sequnces[i].offset);
      glUniform1i (locStartID,cullshade->sequnces[i].first);
      glUniform1ui(locEndOffset,cullshade->sequnces[i].endoffset);
      glUniform1i (locEndID,cullshade->sequnces[i].first + cullshade->sequnces[i].num - 1);
      glDrawArrays(GL_POINTS,cullshade->sequnces[i].first,cullshade->sequnces[i].num);
    }

    glDisableVertexAttribArray(0);
    glDisableVertexAttribArray(1);
    glDisableVertexAttribArray(2);

    glBindBuffer(GL_ARRAY_BUFFER,0);

    // unbind the five shader storage slots used above
    for (GLuint i = 0; i < 5; i++){
      glBindBufferBase(GL_SHADER_STORAGE_BUFFER,i,0);
    }

    glDisable(GL_RASTERIZER_DISCARD);
  }

  // Submits the token stream of one shade mode, either through the command list
  // extension or through the software token interpreter, and restores the touched GL
  // bindings afterwards.
  //
  // shadetype   - shade mode to render
  // resources   - shared GL resources (scene UBO, state capture)
  // profiler    - profiler receiving a section named by `what`
  // progManager - unused in this function
  // what        - name of the profiler section
  void RendererCullSortToken::drawScene(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager, const char*what)
  {
    const CadScene* NV_RESTRICT scene = m_scene;

    nvh::Profiler::Section  section(profiler,what);

    // do state setup (primarily for sake of state capturing)
    scene->enableVertexFormat(VERTEX_POS,VERTEX_NORMAL);

    if (!m_bindlessVboUbo){
      glBindBufferBase(GL_UNIFORM_BUFFER,UBO_SCENE,resources.sceneUbo);
    }
    else{
      glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
      glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
      glEnableClientState(GL_UNIFORM_BUFFER_UNIFIED_NV);
    }

    captureState(resources);

    // without a polygon offset token the offset has to be set through the regular API
    const bool withWire = (shadetype == SHADE_SOLIDWIRE) || (shadetype == SHADE_SOLIDWIRE_SPLIT);
    if (!USE_POLYOFFSETTOKEN && withWire){
      glPolygonOffset(1,1);
    }

    ShadeCommand& commands = m_shades[shadetype];
    if (!m_hwsupport){
      // software path: interpret the CPU copy of the token stream
      std::string& cpuTokens = m_tokenStreams[shadetype];
      renderShadeCommandSW(&cpuTokens[0], cpuTokens.size(), commands);
    }
    else if (m_uselist){
      glCallCommandListNV(m_commandLists[shadetype]);
    }
    else{
      glDrawCommandsStatesNV(m_tokenBuffers[shadetype], &commands.offsets[0], &commands.sizes[0], &commands.states[0], &commands.fbos[0], int(commands.states.size()) );
    }

    // reset the bindings and state touched by the tokens
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_SCENE, 0);
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_MATRIX, 0);
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_MATERIAL, 0);

    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
    glBindVertexBuffer(0,0,0,0);

    glDisable(GL_POLYGON_OFFSET_FILL);
    glPolygonOffset(0,0);

    if (m_bindlessVboUbo){
      glDisableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
      glDisableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
      glDisableClientState(GL_UNIFORM_BUFFER_UNIFIED_NV);
    }

    scene->disableVertexFormat(VERTEX_POS,VERTEX_NORMAL);
  }


#define CULL_TEMPORAL_NOFRUSTUM 1

  // Culls the token stream of the given shade mode on the GPU and draws the result.
  //
  // Without USE_TEMPORALRASTER a single frustum cull is followed by one draw.
  // With USE_TEMPORALRASTER the scene is drawn twice per frame: first with the token
  // stream built from the existing visibility bits ("Last"), then - after a raster based
  // cull against that result - with the tokens selected by BITS_CURRENT_AND_NOT_LAST ("New").
  //
  // After each cull step the output tokens are either read back into the CPU-side token
  // stream (emulation) or made visible to command processing with a command barrier.
  //
  // shadetype   - shade mode to render
  // resources   - shared GL resources; resources.cullView is passed to the culling system
  // profiler    - receives the "CullF"/"ResF"/"CullR"/"ResR"/"Read" sections
  // progManager - used to resolve the token_cmds and token_sizes programs
  void RendererCullSortToken::draw(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager)
  {
    // broken in other types atm
    //shadetype = SHADE_SOLID;

    m_culljob.program_cmds  = progManager.get( Shared::get().token_cmds );
    m_culljob.program_sizes = progManager.get( Shared::get().token_sizes );

    PrepareCullJob(shadetype);

    CullingSystem& cullSys = Renderer::s_cullsys;


#if !USE_TEMPORALRASTER

    {
      nvh::Profiler::Section section(profiler,"CullF");
      cullSys.buildOutput( CullingSystem::METHOD_FRUSTUM, m_culljob, resources.cullView );
      cullSys.bitsFromOutput( m_culljob, CullingSystem::BITS_CURRENT );
      {
        nvh::Profiler::Section section(profiler,"ResF");
        cullSys.resultFromBits( m_culljob );
      }

      if (m_emulate){
        // read the culled tokens back into the CPU stream of this shade mode,
        // the same way the temporal paths below do
        nvh::Profiler::Section read(profiler,"Read");
        void* data = &m_tokenStreams[shadetype][m_culljob.tokenOut.offset];
        m_culljob.tokenOut.GetNamedBufferSubData(data);
      }
      else {
        glBindBuffer(GL_DRAW_INDIRECT_BUFFER, m_culljob.tokenOut.buffer);
        glMemoryBarrier(GL_COMMAND_BARRIER_BIT);
        glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
        //glFinish();
      }
    }

    drawScene(shadetype,resources,profiler,progManager, "Last");

#else

    {
      nvh::Profiler::Section section(profiler,"CullF");
#if CULL_TEMPORAL_NOFRUSTUM
      // no frustum pass: build the tokens straight from the existing bits
      {
        nvh::Profiler::Section section(profiler,"ResF");
        cullSys.resultFromBits( m_culljob );
      }
      cullSys.swapBits( m_culljob );  // last/output
#else
      cullSys.buildOutput( CullingSystem::METHOD_FRUSTUM, m_culljob, resources.cullView );
      cullSys.bitsFromOutput( m_culljob, CullingSystem::BITS_CURRENT_AND_LAST );
      {
        nvh::Profiler::Section section(profiler,"ResF");
        cullSys.resultFromBits( m_culljob );
      }
#endif
      if (m_emulate){
        nvh::Profiler::Section read(profiler,"Read");
        void* data = &m_tokenStreams[shadetype][m_culljob.tokenOut.offset];
        m_culljob.tokenOut.GetNamedBufferSubData(data);
      }
      else {
        glBindBuffer(GL_DRAW_INDIRECT_BUFFER, m_culljob.tokenOut.buffer);
        glMemoryBarrier(GL_COMMAND_BARRIER_BIT);
        glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
        //glFinish();
      }
    }

    drawScene(shadetype,resources,profiler,progManager, "Last");

    {
      nvh::Profiler::Section section(profiler,"CullR");
      cullSys.buildOutput( CullingSystem::METHOD_RASTER, m_culljob, resources.cullView );
      cullSys.bitsFromOutput( m_culljob, CullingSystem::BITS_CURRENT_AND_NOT_LAST );
      {
        nvh::Profiler::Section section(profiler,"ResR");
        cullSys.resultFromBits( m_culljob );
      }

      // for next frame
      cullSys.bitsFromOutput( m_culljob, CullingSystem::BITS_CURRENT );
#if !CULL_TEMPORAL_NOFRUSTUM
      cullSys.swapBits( m_culljob );  // last/output
#endif
      if (m_emulate){
        nvh::Profiler::Section read(profiler,"Read");
        void* data = &m_tokenStreams[shadetype][m_culljob.tokenOut.offset];
        m_culljob.tokenOut.GetNamedBufferSubData(data);
      }
      else {
        glBindBuffer(GL_DRAW_INDIRECT_BUFFER, m_culljob.tokenOut.buffer);
        glMemoryBarrier(GL_COMMAND_BARRIER_BIT);
        glBindBuffer(GL_DRAW_INDIRECT_BUFFER, 0);
        //glFinish();
      }
    }

    drawScene(shadetype,resources,profiler,progManager, "New");
#endif
  }

}


================================================
FILE: renderertokenstream.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include "tokenbase.hpp"

#include "common.h"

namespace csfviewer
{
  //////////////////////////////////////////////////////////////////////////

  class RendererTokenStream: public Renderer, public TokenRendererBase {
  public:
    class Type : public Renderer::Type 
    {
      bool isAvailable() const
      {
        return TokenRendererBase::hasNativeCommandList();
      }
      const char* name() const
      {
        return "tokenstream";
      }
      Renderer* create() const
      {
        RendererTokenStream* renderer = new RendererTokenStream();
        return renderer;
      }
      unsigned int priority() const 
      {
        return 10;
      }
    };
    class TypeEmu : public Renderer::Type 
    {
      bool isAvailable() const
      {
        return !!has_GL_NV_vertex_buffer_unified_memory;
      }
      const char* name() const
      {
        return "tokenstream_emulated";
      }
      Renderer* create() const
      {
        RendererTokenStream* renderer = new RendererTokenStream();
        renderer->m_emulate = true;
        return renderer;
      }
      unsigned int priority() const 
      {
        return 10;
      }
    };

  public:
    void init(const CadScene* NV_RESTRICT scene, const Resources& resources);
    void deinit();
    void draw(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager);

  private:

    static const size_t bufferSize = 1024*16;

    std::vector<DrawItem>       m_drawItems;

    size_t GenerateTokens(NVPointerStream& tokenStream, std::vector<DrawItem>& drawItems, size_t from, ShadeType shade, const CadScene* NV_RESTRICT scene, const Resources& resources )
    {
      int lastMaterial = -1;
      int lastGeometry = -1;
      int lastMatrix   = -1;
      bool lastSolid   = true;

      ShadeCommand& sc = m_shades[shade];
      sc.fbos.clear();
      sc.offsets.clear();
      sc.sizes.clear();
      sc.states.clear();

      size_t begin = 0;

      {
        NVTokenUbo ubo;
        ubo.cmd.index   = UBO_SCENE;
        ubo.cmd.stage   = UBOSTAGE_VERTEX;
        ubo.setBuffer(resources.sceneUbo, resources.sceneAddr, 0, sizeof(SceneData));
        nvtokenEnqueue(tokenStream, ubo);

        ubo.cmd.stage   = UBOSTAGE_FRAGMENT;
        nvtokenEnqueue(tokenStream, ubo);

#if USE_POLYOFFSETTOKEN
        NVTokenPolygonOffset offset;
        offset.cmd.bias = 1;
        offset.cmd.scale = 1;
        nvtokenEnqueue(tokenStream, offset);
#endif
      }

      size_t i = from;
      for (; i < drawItems.size(); i++){
        const DrawItem& di = drawItems[i];

        if (tokenStream.size() + sizeof(NVTokenIbo) + sizeof(NVTokenVbo) + sizeof(NVTokenUbo)*2 + sizeof(NVTokenDrawElemsUsed) > tokenStream.capacity()){
          break;
        }

        if (shade == SHADE_SOLID && !di.solid){
          continue;
        }

        if ((shade == SHADE_SOLIDWIRE || shade == SHADE_SOLIDWIRE_SPLIT) && di.solid != lastSolid){
          sc.offsets.push_back( begin );
          sc.sizes.  push_back( GLsizei((tokenStream.size()-begin)) );
          sc.states. push_back( m_stateObjects[ lastSolid ? STATE_TRISOFFSET : STATE_LINES ] );
          if ( shade == SHADE_SOLIDWIRE_SPLIT ){
            sc.fbos.   push_back( USE_STATEFBO_SPLIT ? 0 : ( di.solid ? resources.fbo : resources.fbo2  ) );
          }
          else{
            sc.fbos.push_back(0);
          }


          begin = tokenStream.size();
        }

        if (lastGeometry != di.geometryIndex){
          const CadScene::Geometry &geo = scene->m_geometry[di.geometryIndex];
          NVTokenVbo vbo;
          vbo.cmd.index = 0;
          vbo.setBuffer(geo.vboGL, geo.vboADDR, 0);
          nvtokenEnqueue(tokenStream, vbo);

          NVTokenIbo ibo;
          ibo.setBuffer(geo.iboGL, geo.iboADDR);
          ibo.cmd.typeSizeInByte = 4;
          nvtokenEnqueue(tokenStream, ibo);

          lastGeometry = di.geometryIndex;
        }

        if (lastMatrix != di.matrixIndex){

          NVTokenUbo ubo;
          ubo.cmd.index   = UBO_MATRIX;
          ubo.cmd.stage   = UBOSTAGE_VERTEX;
          ubo.setBuffer(scene->m_matricesGL, scene->m_matricesADDR, sizeof(CadScene::MatrixNode) * di.matrixIndex, sizeof(CadScene::MatrixNode));
          nvtokenEnqueue(tokenStream, ubo);

          lastMatrix = di.matrixIndex;
        }

        if (lastMaterial != di.materialIndex){

          NVTokenUbo ubo;
          ubo.cmd.index   = UBO_MATERIAL;
          ubo.cmd.stage   = UBOSTAGE_FRAGMENT;
          ubo.setBuffer(scene->m_materialsGL, scene->m_materialsADDR, sizeof(CadScene::Material) * di.materialIndex, sizeof(CadScene::Material));
          nvtokenEnqueue(tokenStream, ubo);

          lastMaterial = di.materialIndex;
        }


        NVTokenDrawElemsUsed drawelems;
        drawelems.setMode(di.solid ? GL_TRIANGLES : GL_LINES);
        drawelems.cmd.count = di.range.count;
        drawelems.cmd.firstIndex = GLuint((di.range.offset )/sizeof(GLuint));
        nvtokenEnqueue(tokenStream, drawelems);

        lastSolid = di.solid;
      }

      sc.offsets.push_back( begin );
      sc.sizes.  push_back( GLsizei((tokenStream.size()-begin)) );
      if (shade == SHADE_SOLID){
        sc.states. push_back( m_stateObjects[ STATE_TRIS ] );
      }
      else{
        sc.states. push_back( m_stateObjects[ lastSolid ? STATE_TRISOFFSET : STATE_LINES ] );
      }
      if ( shade == SHADE_SOLIDWIRE_SPLIT ){
        sc.fbos.   push_back( USE_STATEFBO_SPLIT ? 0 : ( lastSolid ? resources.fbo : resources.fbo2  ) );
      }
      else{
        sc.fbos.push_back(0);
      }

      return i;
    }

  };

  // File-local instances of the two type descriptors ("tokenstream" and "tokenstream_emulated").
  // NOTE(review): presumably constructing a Renderer::Type registers it with the
  // renderer registry - confirm in renderer.hpp.
  static RendererTokenStream::Type s_sorttoken;
  static RendererTokenStream::TypeEmu s_sorttoken_emu;

  // Collects the draw items of the scene and allocates, per shade mode, a CPU token
  // stream and a GL token buffer of bufferSize bytes each. No tokens are generated
  // here; that happens in draw().
  //
  // scene     - scene to render; kept in m_scene
  // resources - shared GL resources, forwarded to TokenRendererBase::finalize
  void RendererTokenStream::init(const CadScene* NV_RESTRICT scene, const Resources& resources)
  {
    TokenRendererBase::init(s_bindless_ubo, !!has_GL_NV_vertex_buffer_unified_memory);
    resources.usingUboProgram(true);

    m_scene = scene;

    fillDrawItems(m_drawItems,0,scene->m_objects.size(), true, true);

    TokenRendererBase::finalize(resources,false);

    // CPU staging storage plus an uninitialized GL buffer of the same size for every shade mode
    for (int shade = 0; shade != NUM_SHADES; ++shade){
      m_tokenStreams[shade].resize(bufferSize);
      glNamedBufferData(m_tokenBuffers[shade], bufferSize, NULL, GL_DYNAMIC_DRAW);
    }
  }

  // Tears down the token renderer base and drops the cached draw items.
  void RendererTokenStream::deinit()
  {
    TokenRendererBase::deinit();
    m_drawItems.clear();
  }

  // Renders the scene by regenerating the token stream every frame. The draw items are
  // encoded chunk by chunk into a bufferSize-byte stream; each chunk is uploaded and
  // submitted before the next one is generated.
  //
  // shadetype   - shade mode to render
  // resources   - shared GL resources (scene UBO, state capture, framebuffers)
  // profiler    - receives the "Token", "Send" and "Draw" sections of every chunk
  // progManager - unused in this function
  void RendererTokenStream::draw(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager)
  {
    const CadScene* NV_RESTRICT scene = m_scene;

    // do state setup (primarily for sake of state capturing)
    scene->enableVertexFormat(VERTEX_POS,VERTEX_NORMAL);

    if (m_bindlessVboUbo){
      glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
      glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
      glEnableClientState(GL_UNIFORM_BUFFER_UNIFIED_NV);
    }
    else{
      glBindBufferBase(GL_UNIFORM_BUFFER,UBO_SCENE,resources.sceneUbo);
    }

    captureState(resources);

    if (!USE_POLYOFFSETTOKEN && (shadetype == SHADE_SOLIDWIRE || shadetype == SHADE_SOLIDWIRE_SPLIT)){
      glPolygonOffset(1,1);
    }

    // upload strategy switches:
    //  useSub        - generate into CPU memory and upload with glNamedBufferSubData
    //  usePersistent - (only without useSub) write through a persistently mapped temporary buffer
    bool useSub = true;
    bool usePersistent = false;

    size_t begin = 0;
    while (begin < m_drawItems.size())
    {
      NVPointerStream stream;
      // defaults to the per-shade token buffer; only the persistent path replaces it
      GLuint buffer = m_tokenBuffers[shadetype];

      // tokens are written straight into GL memory only on hardware and without useSub
      const bool writeMapped = m_hwsupport && !useSub;

      void* bufferPtr = NULL;
      if (writeMapped){
        if (usePersistent){
          // not ideal, best would be finding max frame usage and then keep * 4 the size to account for driver/gpu
          // race
          glCreateBuffers(1,&buffer);
          glNamedBufferStorage(buffer, bufferSize, NULL, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_CLIENT_STORAGE_BIT);
          bufferPtr = glMapNamedBufferRange(buffer, 0, bufferSize, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);
        }
        else{
          bufferPtr = glMapNamedBufferRange(buffer, 0, bufferSize, GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
        }
      }
      else{
        bufferPtr = &m_tokenStreams[shadetype][0];
      }

      stream.init(bufferPtr,bufferSize);

      {
        nvh::Profiler::Section _tempTimer(profiler ,"Token");
        // returns the index of the first item that did not fit into this chunk
        begin = GenerateTokens(stream, m_drawItems, begin, shadetype, scene, resources);
      }

      if (writeMapped && !usePersistent){
        // a buffer mapped without the persistent bit must be unmapped before
        // the GL is allowed to read from it
        glUnmapNamedBuffer(buffer);
      }

      if (useSub){
        nvh::Profiler::Section _tempTimer(profiler ,"Send");
        // orphan the old contents, then upload only the bytes that were generated
        glInvalidateBufferData(buffer);
        glNamedBufferSubData(buffer,0,stream.size(), stream.m_begin);
      }

      {
        nvh::Profiler::Section _tempTimer(profiler ,"Draw");
        if (m_hwsupport){
          ShadeCommand & shade =  m_shades[shadetype];
          glDrawCommandsStatesNV(buffer, &shade.offsets[0], &shade.sizes[0], &shade.states[0], &shade.fbos[0], int(shade.states.size()) );
        }
        else{
          ShadeCommand & shade =  m_shades[shadetype];
          renderShadeCommandSW(stream.m_begin, stream.size(), shade);
        }
      }
      
      if (writeMapped && usePersistent){
        // the temporary persistent buffer only lives for this chunk
        glDeleteBuffers(1,&buffer);
      }
    }

    profiler.accumulationSplit();

    // reset the bindings and state touched by the tokens
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_SCENE, 0);
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_MATRIX, 0);
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_MATERIAL, 0);

    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
    glBindVertexBuffer(0,0,0,0);

    glDisable(GL_POLYGON_OFFSET_FILL);
    glPolygonOffset(0,0);

    if (m_bindlessVboUbo){
      glDisableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
      glDisableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
      glDisableClientState(GL_UNIFORM_BUFFER_UNIFIED_NV);
    }

    scene->disableVertexFormat(VERTEX_POS,VERTEX_NORMAL);
  }

}


================================================
FILE: rendereruborange.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include <assert.h>
#include <algorithm>
#include "renderer.hpp"

#include "common.h"

namespace csfviewer
{
  //////////////////////////////////////////////////////////////////////////

  // Renderer that walks its draw item list on the CPU. It is offered in four
  // flavours, selected by the m_vbum and m_sort flags set by the type descriptors below.
  class RendererUboRange: public Renderer {
  public:
    // "uborange": always available, both flags off.
    class Type : public Renderer::Type 
    {
      bool isAvailable() const { return true; }

      const char* name() const { return "uborange"; }

      Renderer* create() const
      {
        return new RendererUboRange();
      }

      unsigned int priority() const { return 0; }
    };

    // "uborange_bindless": needs GL_NV_vertex_buffer_unified_memory, sets m_vbum.
    class TypeEmu : public Renderer::Type 
    {
      bool isAvailable() const { return !!has_GL_NV_vertex_buffer_unified_memory; }

      const char* name() const { return "uborange_bindless"; }

      Renderer* create() const
      {
        RendererUboRange* instance = new RendererUboRange();
        instance->m_vbum = true;
        return instance;
      }

      unsigned int priority() const { return 0; }
    };

    // "uborange_sorted": always available, sets m_sort.
    class TypeSort : public Renderer::Type 
    {
      bool isAvailable() const { return true; }

      const char* name() const { return "uborange_sorted"; }

      Renderer* create() const
      {
        RendererUboRange* instance = new RendererUboRange();
        instance->m_sort = true;
        return instance;
      }

      unsigned int priority() const { return 1; }
    };

    // "uborange_sorted_bindless": needs GL_NV_vertex_buffer_unified_memory, sets both flags.
    class TypeSortEmu : public Renderer::Type 
    {
      bool isAvailable() const { return !!has_GL_NV_vertex_buffer_unified_memory; }

      const char* name() const { return "uborange_sorted_bindless"; }

      Renderer* create() const
      {
        RendererUboRange* instance = new RendererUboRange();
        instance->m_vbum = true;
        instance->m_sort = true;
        return instance;
      }

      unsigned int priority() const { return 1; }
    };

  public:
    void init(const CadScene* NV_RESTRICT scene, const Resources& resources);
    void deinit();
    void draw(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager);

    // Both options start disabled; the type descriptors enable them after construction.
    RendererUboRange() : m_vbum(false), m_sort(false) {}

    bool                        m_vbum;   // use the bindless (unified memory) binding path when drawing
    bool                        m_sort;   // sort the draw items in init()

  private:

    std::vector<DrawItem>       m_drawItems;   // filled by init(), consumed by draw()

  };
  // File-local instances of the four type descriptors of this renderer.
  // NOTE(review): presumably constructing a Renderer::Type registers it with the
  // renderer registry - confirm in renderer.hpp.
  static RendererUboRange::Type         s_uborange;
  static RendererUboRange::TypeEmu      s_uborange_emu;

  static RendererUboRange::TypeSort     s_sortuborange;
  static RendererUboRange::TypeSortEmu  s_sortuborange_emu;

  // Stores the scene and builds the draw item list, sorted with the group
  // comparator when the sorted variant was requested.
  //
  // scene     - scene to render; kept in m_scene
  // resources - unused in this function
  void RendererUboRange::init(const CadScene* NV_RESTRICT scene, const Resources& resources)
  {
    m_scene = scene;
    fillDrawItems(m_drawItems,0,scene->m_objects.size(), true, true);

    // the unsorted variants keep the order produced by fillDrawItems
    if (!m_sort){
      return;
    }

    std::sort(m_drawItems.begin(),m_drawItems.end(),DrawItem_compare_groups);
  }

  // Drops the cached draw items; this renderer creates no GL objects in init().
  void RendererUboRange::deinit()
  {
    m_drawItems.clear();
  }

  // Draws all items of m_drawItems with one glDrawElements call each.
  //
  // Geometry, matrix and material bindings are only re-issued when the index
  // of the current item differs from the previous one. Matrix and material
  // data are selected by binding a sub-range of the scene-wide buffers
  // (glBindBufferRange) or, on the bindless path, by setting the address range
  // (glBufferAddressRangeNV).
  //
  // shadetype  : SHADE_SOLID skips wire items; the SOLIDWIRE modes enable
  //              polygon offset, and SOLIDWIRE_SPLIT additionally switches
  //              between resources.fbo (solid) and resources.fbo2 (wire).
  // resources  : supplies the program, scene UBO / address and framebuffers.
  // profiler, progManager : not used by this renderer.
  void RendererUboRange::draw(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager)
  {
    const CadScene* NV_RESTRICT scene = m_scene;

    const bool vbum        = m_vbum;
    // uniform buffers go through addresses only if the variant and the global switch both ask for it
    const bool bindlessUbo = vbum && s_bindless_ubo;

    scene->enableVertexFormat(VERTEX_POS,VERTEX_NORMAL);

    if (vbum){
      glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
      glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
    }

    if (bindlessUbo){
      glEnableClientState(GL_UNIFORM_BUFFER_UNIFIED_NV);
      glBufferAddressRangeNV(GL_UNIFORM_BUFFER_ADDRESS_NV,UBO_SCENE,resources.sceneAddr,sizeof(SceneData));
    }
    else{
      glBindBufferBase(GL_UNIFORM_BUFFER,UBO_SCENE,resources.sceneUbo);
    }

    glUseProgram(resources.programUbo);

    SetWireMode(GL_FALSE);

    if (shadetype == SHADE_SOLIDWIRE || shadetype == SHADE_SOLIDWIRE_SPLIT){
      glEnable(GL_POLYGON_OFFSET_FILL);
      glPolygonOffset(1,1);
    }

    {
      // -1 never matches a valid index, so the first item binds everything
      int lastMaterial = -1;
      int lastGeometry = -1;
      int lastMatrix   = -1;
      // matches the SetWireMode(GL_FALSE) issued above
      bool lastSolid   = true;

      for (size_t i = 0; i < m_drawItems.size(); i++){
        const DrawItem& di = m_drawItems[i];

        if (shadetype == SHADE_SOLID && !di.solid){
          // the sorted variants stop at the first wire item instead of skipping each one
          if (m_sort) break;
          continue;
        }

        if (lastSolid != di.solid){
          SetWireMode( di.solid ? GL_FALSE : GL_TRUE );
          if (shadetype == SHADE_SOLIDWIRE_SPLIT){
            glBindFramebuffer(GL_FRAMEBUFFER, di.solid ? resources.fbo : resources.fbo2);
          }
        }

        if (lastGeometry != di.geometryIndex){
          const CadScene::Geometry &geo = scene->m_geometry[di.geometryIndex];

          if (vbum){
            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0,  geo.vboADDR, geo.numVertices * sizeof(CadScene::Vertex));
            glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV,0,         geo.iboADDR, (geo.numIndexSolid+geo.numIndexWire) * sizeof(GLuint));
          }
          else{
            glBindVertexBuffer(0, geo.vboGL, 0, sizeof(CadScene::Vertex));
            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, geo.iboGL);
          }

          lastGeometry = di.geometryIndex;
        }

        if (lastMatrix != di.matrixIndex){

          if (bindlessUbo){
            glBufferAddressRangeNV(GL_UNIFORM_BUFFER_ADDRESS_NV,UBO_MATRIX, scene->m_matricesADDR + sizeof(CadScene::MatrixNode) * di.matrixIndex, sizeof(CadScene::MatrixNode));
          }
          else{
            glBindBufferRange(GL_UNIFORM_BUFFER,UBO_MATRIX, scene->m_matricesGL, sizeof(CadScene::MatrixNode) * di.matrixIndex, sizeof(CadScene::MatrixNode));
          }

          lastMatrix = di.matrixIndex;
        }

        if (lastMaterial != di.materialIndex){

          if (bindlessUbo){
            glBufferAddressRangeNV(GL_UNIFORM_BUFFER_ADDRESS_NV,UBO_MATERIAL, scene->m_materialsADDR +sizeof(CadScene::Material) * di.materialIndex, sizeof(CadScene::Material));
          }
          else{
            glBindBufferRange(GL_UNIFORM_BUFFER,UBO_MATERIAL, scene->m_materialsGL, sizeof(CadScene::Material) * di.materialIndex, sizeof(CadScene::Material));
          }

          lastMaterial = di.materialIndex;
        }

        glDrawElements( di.solid ? GL_TRIANGLES : GL_LINES, di.range.count, GL_UNSIGNED_INT, (void*) di.range.offset);

        lastSolid = di.solid;
      }
    }

    // reset the bindings and state touched above
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_SCENE, 0);
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_MATRIX, 0);
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_MATERIAL, 0);

    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
    glBindVertexBuffer(0,0,0,0);

    glDisable(GL_POLYGON_OFFSET_FILL);
    glPolygonOffset(0,0);

    if (vbum){
      glDisableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
      glDisableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
    }
    if (bindlessUbo){
      glDisableClientState(GL_UNIFORM_BUFFER_UNIFIED_NV);
    }

    scene->disableVertexFormat(VERTEX_POS,VERTEX_NORMAL);
  }

}


================================================
FILE: rendererubosub.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include <assert.h>
#include <algorithm>
#include "renderer.hpp"

#include "common.h"

namespace csfviewer
{
  //////////////////////////////////////////////////////////////////////////

  class RendererUboSub: public Renderer {
  public:
    class Type : public Renderer::Type 
    {
      bool isAvailable() const
      {
        return true;
      }
      const char* name() const
      {
        return "ubosub";
      }
      Renderer* create() const
      {
        RendererUboSub* renderer = new RendererUboSub();
        return renderer;
      }
      unsigned int priority() const 
      {
        return 2;
      }
    };
    class TypeVbum : public Renderer::Type 
    {
      bool isAvailable() const
      {
        return !!has_GL_NV_vertex_buffer_unified_memory;
      }
      const char* name() const
      {
        return "ubosub_bindless";
      }
      Renderer* create() const
      {
        RendererUboSub* renderer = new RendererUboSub();
        renderer->m_vbum = true;
        return renderer;
      }
      unsigned int priority() const 
      {
        return 2;
      }
    };
    class TypeSort : public Renderer::Type 
    {
      bool isAvailable() const
      {
        return true;
      }
      const char* name() const
      {
        return "ubosub_sorted";
      }
      Renderer* create() const
      {
        RendererUboSub* renderer = new RendererUboSub();
        renderer->m_sort = true;
        return renderer;
      }
      unsigned int priority() const 
      {
        return 2;
      }
    };
    class TypeSortVbum : public Renderer::Type 
    {
      bool isAvailable() const
      {
        return !!has_GL_NV_vertex_buffer_unified_memory;
      }
      const char* name() const
      {
        return "ubosub_sorted_bindless";
      }
      Renderer* create() const
      {
        RendererUboSub* renderer = new RendererUboSub();
        renderer->m_vbum = true;
        renderer->m_sort = true;
        return renderer;
      }
      unsigned int priority() const 
      {
        return 2;
      }
    };

  public:
    void init(const CadScene* NV_RESTRICT scene, const Resources& resources);
    void deinit();
    void draw(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager);

    bool                        m_sort;
    bool                        m_vbum;

  private:

    std::vector<DrawItem>       m_drawItems;

    GLuint                      m_streamMatrix;
    GLuint                      m_streamMaterial;

    RendererUboSub()
      : m_vbum(false)
      , m_sort(false)
    {

    }

  };

  // One file-local instance per RendererUboSub variant.
  // NOTE(review): presumably the Renderer::Type base constructor registers each
  // instance with the renderer registry — confirm in renderer.hpp.
  static RendererUboSub::Type s_ubosub;
  static RendererUboSub::TypeVbum s_ubosub_vbum;
  static RendererUboSub::TypeSort s_ubosub_sort;
  static RendererUboSub::TypeSortVbum s_ubosub_vbum_sort;

  // Prepares the renderer for `scene`:
  //  - calls resources.usingUboProgram(true),
  //  - builds the draw item list (sorted when m_sort is set),
  //  - creates the two streaming uniform buffers, each sized for exactly one
  //    CadScene::MatrixNode / CadScene::Material and left without initial data.
  void RendererUboSub::init(const CadScene* NV_RESTRICT scene, const Resources& resources)
  {
    resources.usingUboProgram(true);
    m_scene = scene;

    fillDrawItems(m_drawItems,0,scene->m_objects.size(), true, true);

    if (m_sort){
      std::sort(m_drawItems.begin(),m_drawItems.end(),DrawItem_compare_groups);
    }

    glCreateBuffers(1,&m_streamMatrix);
    glCreateBuffers(1,&m_streamMaterial);
    // contents are supplied per draw item in draw(), hence NULL data and a streaming usage hint
    glNamedBufferData( m_streamMatrix, sizeof(CadScene::MatrixNode), NULL, GL_STREAM_DRAW);
    glNamedBufferData( m_streamMaterial, sizeof(CadScene::Material), NULL, GL_STREAM_DRAW);
  }

  void RendererUboSub::deinit()
  {
    glDeleteBuffers(1,&m_streamMatrix);
    glDeleteBuffers(1,&m_streamMaterial);
  }

  // Draws all items of m_drawItems with one glDrawElements call each.
  //
  // The matrix and material uniform blocks are backed by the two one-element
  // streaming buffers; whenever an item uses a different matrix / material
  // index than its predecessor, the new data is copied from the CPU-side
  // scene arrays with glNamedBufferSubData.
  //
  // shadetype  : SHADE_SOLID skips wire items; both SOLIDWIRE modes enable
  //              polygon offset, and SOLIDWIRE_SPLIT additionally switches
  //              between resources.fbo (solid) and resources.fbo2 (wire).
  // resources  : supplies the program, scene UBO and framebuffers.
  // profiler, progManager : not used by this renderer.
  void RendererUboSub::draw(ShadeType shadetype, const Resources& resources, nvh::Profiler& profiler, nvgl::ProgramManager &progManager)
  {
    const CadScene* NV_RESTRICT scene = m_scene;

    const bool vbum = m_vbum;
    // Wire items are drawn over solid ones in both modes, so both need the
    // fill offset (same condition as RendererUboRange::draw).
    const bool wireOverlay = (shadetype == SHADE_SOLIDWIRE || shadetype == SHADE_SOLIDWIRE_SPLIT);

    scene->enableVertexFormat(VERTEX_POS,VERTEX_NORMAL);

    glUseProgram(resources.programUbo);

    SetWireMode(GL_FALSE);

    if (wireOverlay){
      glEnable(GL_POLYGON_OFFSET_FILL);
      glPolygonOffset(1,1);
    }

    if (vbum){
      glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
      glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
    }

    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_SCENE,     resources.sceneUbo);
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_MATRIX,    m_streamMatrix);
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_MATERIAL,  m_streamMaterial);

    {
      // -1 never matches a valid index, so the first item uploads everything
      int lastMaterial = -1;
      int lastGeometry = -1;
      int lastMatrix   = -1;
      // matches the SetWireMode(GL_FALSE) issued above
      bool lastSolid   = true;

      for (size_t i = 0; i < m_drawItems.size(); i++){
        const DrawItem& di = m_drawItems[i];

        if (shadetype == SHADE_SOLID && !di.solid){
          // the sorted variants stop at the first wire item instead of skipping each one
          if (m_sort) break;
          continue;
        }

        if (lastSolid != di.solid){
          SetWireMode( di.solid ? GL_FALSE : GL_TRUE );
          if (shadetype == SHADE_SOLIDWIRE_SPLIT){
            glBindFramebuffer(GL_FRAMEBUFFER, di.solid ? resources.fbo : resources.fbo2);
          }
        }

        if (lastGeometry != di.geometryIndex){
          const CadScene::Geometry &geo = scene->m_geometry[di.geometryIndex];

          if (vbum){
            glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, 0,  geo.vboADDR, geo.numVertices * sizeof(CadScene::Vertex));
            glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV,0,         geo.iboADDR, (geo.numIndexSolid+geo.numIndexWire) * sizeof(GLuint));
          }
          else{
            glBindVertexBuffer(0, geo.vboGL, 0, sizeof(CadScene::Vertex));
            glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, geo.iboGL);
          }

          lastGeometry = di.geometryIndex;
        }

        if (lastMatrix != di.matrixIndex){
          glNamedBufferSubData(m_streamMatrix, 0, sizeof(CadScene::MatrixNode), &scene->m_matrices[di.matrixIndex]);
          lastMatrix = di.matrixIndex;
        }

        if (lastMaterial != di.materialIndex){
          glNamedBufferSubData(m_streamMaterial, 0, sizeof(CadScene::Material), &scene->m_materials[di.materialIndex]);
          lastMaterial = di.materialIndex;
        }

        glDrawElements( di.solid ? GL_TRIANGLES : GL_LINES, di.range.count, GL_UNSIGNED_INT, (void*) di.range.offset);

        lastSolid = di.solid;
      }
    }

    // reset the bindings and state touched above
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_SCENE, 0);
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_MATRIX, 0);
    glBindBufferBase(GL_UNIFORM_BUFFER,UBO_MATERIAL, 0);

    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
    glBindVertexBuffer(0,0,0,0);

    if (vbum){
      glDisableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
      glDisableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
    }

    if (wireOverlay){
      glDisable(GL_POLYGON_OFFSET_FILL);
      glPolygonOffset(0,0);
    }

    scene->disableVertexFormat(VERTEX_POS,VERTEX_NORMAL);
  }

}


================================================
FILE: scan.comp.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 430
/**/

// The same source is compiled three times; TASK selects which main() is built.
#define TASK_SUM      0
#define TASK_OFFSETS  1
#define TASK_COMBINE  2

// Default to the per-batch prefix sum when the host does not define TASK.
#ifndef TASK
#define TASK TASK_SUM
#endif

// Invocations per work group, and elements handled per work group
// (each invocation processes one uvec4, i.e. four elements).
#define THREADBLOCK_SIZE  512
#define BATCH_SIZE        (THREADBLOCK_SIZE*4)


// Element count used by each main() for its bounds checks.
uniform uint numElements;

///////////////////////////////////////////////////////
// based on CUDA Sample "scan.cu"

layout (local_size_x = THREADBLOCK_SIZE) in;

// The scan helpers below are only needed by TASK_SUM and TASK_OFFSETS.
#if TASK != TASK_COMBINE

// Index of this invocation inside its work group (x dimension only).
uint threadIdx = gl_LocalInvocationID.x;

// NOTE(review): these #extension directives follow non-preprocessor tokens;
// the GLSL specification expects them first — confirm the target compilers accept it.
#extension GL_NV_shader_thread_group : enable
#extension GL_NV_shader_thread_shuffle : enable

#if GL_NV_shader_thread_group

#define USESHUFFLE
// Warp size is hard-coded to 32 lanes.
#define LOG2_WARP_SIZE 5U
#define      WARP_SIZE (1U << LOG2_WARP_SIZE)

// Almost the same as naive scan1Inclusive but doesn't need barriers
// nor shared memory
// and works only for size <= WARP_SIZE

#if GL_NV_shader_thread_shuffle

// One slot per warp: holds the warp totals exchanged in scan1Inclusive.
shared uint s_Data[(THREADBLOCK_SIZE / WARP_SIZE)];

// Inclusive prefix sum of idata across the lanes of one warp, using lane shuffles.
// size is the number of participating elements; the loop runs at most five
// doubling steps (offsets 1,2,4,8,16) and stops once 2^(STEP+1) exceeds size.
uint warpScanInclusive(uint idata, uint size){
  uint sum = idata;
  
  for (int STEP = 0; STEP < 5 && (1<<(STEP+1)) <= size; STEP++){
    bool valid = false;
    // fetch the running sum of the lane (1 << STEP) positions below, within a width of 32
    uint temp = shuffleUpNV(sum, 1 << STEP, 32, valid);
    // only accumulate when shuffleUpNV reported the source lane as valid
    if (valid) {
      sum += temp;
    }
  }

  return sum;
}

#else

// Two slots per invocation: a zero-padding half followed by a data half for each segment.
shared uint s_Data[THREADBLOCK_SIZE * 2];

// Almost the same as naive scan1Inclusive but doesn't need barriers
// and works only for size <= WARP_SIZE
// NOTE(review): the absence of barriers presumably relies on all lanes of a
// warp executing these statements in lockstep — confirm for the target hardware.

uint warpScanInclusive(uint idata, uint size){
  // start of this invocation's padding slot; the zero written here is what
  // reads below the segment start pick up
  uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
  s_Data[pos] = 0;
  // move to the data slot, `size` entries above the padding slot
  pos += size;
  s_Data[pos] = idata;

  // unrolled doubling steps with offsets 1,2,4,8,16
  if(size >=  2) s_Data[pos] += s_Data[pos -  1];
  if(size >=  4) s_Data[pos] += s_Data[pos -  2];
  if(size >=  8) s_Data[pos] += s_Data[pos -  4];
  if(size >= 16) s_Data[pos] += s_Data[pos -  8];
  if(size >= 32) s_Data[pos] += s_Data[pos - 16];

  return s_Data[pos];
}

#endif

// Exclusive warp scan: the inclusive result with this lane's own input removed.
uint warpScanExclusive(uint idata, uint size){
    uint inclusive = warpScanInclusive(idata, size);
    return inclusive - idata;
}

// Inclusive prefix sum of idata across the work group (one value per invocation).
// For size > WARP_SIZE this is a two-level scan: every warp scans its own
// values, the per-warp totals are scanned exclusively by the first
// THREADBLOCK_SIZE / WARP_SIZE invocations, and each invocation adds the
// scanned total of its warp. Smaller sizes use a single warp scan.
// Contains barrier() calls on the size > WARP_SIZE path.
uint scan1Inclusive(uint idata, uint size){
  if(size > WARP_SIZE){
    //Bottom-level inclusive warp scan
    uint warpResult = warpScanInclusive(idata, WARP_SIZE);

    //Save top elements of each warp for exclusive warp scan
  #if !GL_NV_shader_thread_shuffle
    //sync to wait for warp scans to complete (because s_Data is being overwritten)
    memoryBarrierShared();
    barrier();
  #endif
    // the last lane of each warp holds that warp's total
    if( (threadIdx & (WARP_SIZE - 1)) == (WARP_SIZE - 1) )
        s_Data[threadIdx >> LOG2_WARP_SIZE] = warpResult;

    //wait for warp scans to complete
    memoryBarrierShared();
    barrier();
    if( threadIdx < (THREADBLOCK_SIZE / WARP_SIZE) ){
        //grab top warp elements
        uint val = s_Data[threadIdx];
        //calculate exclusive scan and write back to shared memory
        s_Data[threadIdx] = warpScanExclusive(val, size >> LOG2_WARP_SIZE);
    }

    //return updated warp scans with exclusive scan results
    memoryBarrierShared();
    barrier();
    return warpResult + s_Data[threadIdx >> LOG2_WARP_SIZE];
  }else{
    return warpScanInclusive(idata, size);
  }
}

#else

// Two slots per invocation: a zero-padding half followed by a data half for each segment.
shared uint s_Data[THREADBLOCK_SIZE * 2];

// Inclusive prefix sum of idata across the work group, used when the
// thread-group extension is not available. Doubling offsets are applied
// through shared memory with a barrier before and after every read.
uint scan1Inclusive(uint idata, uint size)
{
    // padding slot first (zeroed), data slot `size` entries above it
    uint pos = 2 * threadIdx.x - (threadIdx.x & (size - 1));
    s_Data[pos] = 0;
    pos += size;
    s_Data[pos] = idata;

    for (uint offset = 1; offset < size; offset <<= 1)
    {
        // make the previous round's writes visible before reading
        memoryBarrierShared();
        barrier();
        uint t = s_Data[pos] + s_Data[pos - offset];
        // all reads of this round must finish before anyone overwrites its slot
        memoryBarrierShared();
        barrier();
        s_Data[pos] = t;
    }

    return s_Data[pos];
}

#endif

// Exclusive work-group scan: the inclusive result minus this invocation's own input.
uint scan1Exclusive(uint idata, uint size)
{
    uint inclusive = scan1Inclusive(idata, size);
    return inclusive - idata;
}

// Inclusive scan over four elements per invocation.
// The four components are summed locally first; the per-invocation totals
// (the .w component) are then scanned exclusively across the work group
// and the resulting offset is added to every component.
uvec4 scan4Inclusive(uvec4 idata4, uint size)
{
    //Level-0 inclusive scan inside the vector
    uint s0 = idata4.x;
    uint s1 = s0 + idata4.y;
    uint s2 = s1 + idata4.z;
    uint s3 = s2 + idata4.w;
    uvec4 local = uvec4(s0, s1, s2, s3);

    //Level-1 exclusive scan over the per-invocation totals
    uint base = scan1Exclusive(local.w, size / 4);

    return local + uvec4(base);
}

//Exclusive vector scan: each invocation passes its four elements as a uvec4;
//the result is the inclusive scan with every element's own value subtracted.
uvec4 scan4Exclusive(uvec4 idata4, uint size)
{
    uvec4 inclusive = scan4Inclusive(idata4, size);
    return inclusive - idata4;
}

#endif


#if TASK == TASK_SUM

// Source values, four per uvec4.
layout (std430, binding=1) buffer inputBuffer {
  uvec4 indata[];
};

// Per-batch inclusive prefix sums, same layout as the input.
layout (std430, binding=0) buffer outputBuffer {
  uvec4 outdata[];
};

// TASK_SUM: every work group computes the inclusive prefix sum of its own
// batch of BATCH_SIZE elements. Invocations past the end contribute zeros but
// still take part in the scan, because the scan contains barriers.
void main()
{
  uint slot      = gl_GlobalInvocationID.x;
  uint slotCount = (numElements + 3) / 4;
  bool inRange   = slot < slotCount;

  // load, or use zeros when out of range
  uvec4 values = uvec4(0);
  if (inRange) {
    values = indata[slot];
  }

  uvec4 scanned = scan4Inclusive(values, BATCH_SIZE);

  // store
  if (inRange) {
    outdata[slot] = scanned;
  }
}
#endif

#if TASK == TASK_OFFSETS

// Scanned values of the previous level, read as individual uints.
layout (std430, binding=1) buffer inputBuffer {
  uint indata[];
};

// Scanned batch totals, four per uvec4.
layout (std430, binding=0) buffer outputBuffer {
  uvec4 outdata[];
};

// TASK_OFFSETS: every invocation gathers the last element of four consecutive
// batches (their totals) and the work group scans those totals inclusively.
// A result is written only if at least one of the four reads was in range.
void main()
{
  uint idx      = gl_GlobalInvocationID.x;
  uint startIdx = (idx * BATCH_SIZE * 4);

  uvec4 totals  = uvec4(0);
  bool  anyRead = false;

  // gather the batch totals
  for (uint i = 0; i < 4; i++){
    uint readIdx = startIdx + (i+1)*BATCH_SIZE - 1u;
    if ( readIdx >= numElements ){
      continue;
    }
    totals[i] = indata[readIdx];
    anyRead   = true;
  }

  // every invocation must reach the scan, it contains barriers
  uvec4 scanned = scan4Inclusive(totals, BATCH_SIZE);

  if (anyRead) {
    outdata[idx] = scanned;
  }
}
#endif

#if TASK == TASK_COMBINE

// Scanned batch totals, one uint per batch.
layout (std430, binding=1) buffer inputBuffer {
  uint indata[];
};

// Per-batch prefix sums that receive the offset of the preceding batches.
layout (std430, binding=0) buffer outputBuffer {
  uint outdata[];
};

// TASK_COMBINE: adds the scanned total of the previous batch to every element
// that does not belong to the first batch.
void main()
{
  uint element = gl_GlobalInvocationID.x;
  if (element >= numElements) {
    return;
  }

  uint batch = element / BATCH_SIZE;
  if (batch == 0) {
    // the first batch has nothing in front of it
    return;
  }

  outdata[element] += indata[batch-1];
}
#endif


================================================
FILE: scansystem.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include "scansystem.hpp"
#include <assert.h>

// Integer division rounded up: the number of `align`-sized groups needed to
// cover `input` items. Written as quotient plus remainder test so that the
// computation cannot wrap for inputs close to the GLuint maximum.
// align must be non-zero.
inline static GLuint snapdiv(GLuint input, GLuint align)
{
  const GLuint whole = input / align;
  return (input % align) ? whole + 1 : whole;
}

// Returns the number of bytes the offsets buffer must provide for scanning
// `elements` values: nothing when a single batch suffices, otherwise one
// BATCH_ELEMENTS-sized block of GLuints per first-level work group plus, when
// more than one first-level group exists, the blocks of the second level.
size_t ScanSystem::getOffsetSize(GLuint elements)
{
  const GLuint batches = snapdiv(elements,BATCH_ELEMENTS);
  if (batches == 1) {
    return 0;
  }

  const GLuint firstLevel = snapdiv(batches,BATCH_ELEMENTS);
  size_t bytes = firstLevel*BATCH_ELEMENTS*sizeof(GLuint);

  if (firstLevel > 1){
    // a second layer combines the first-level results
    const GLuint secondLevel = snapdiv(firstLevel,BATCH_ELEMENTS);
    bytes += secondLevel*BATCH_ELEMENTS*sizeof(GLuint);
  }

  return bytes;
}

// Computes per-batch inclusive prefix sums of `input` into `output` and, when
// more than one batch is involved, the scanned batch totals into `offsets`.
//
// elements : number of GLuint values; must be a multiple of 4 and smaller than
//            BATCH_ELEMENTS^3.
// input    : source values (bound to storage binding 1).
// output   : receives the per-batch sums (storage binding 0); at least as
//            large as input.
// offsets  : scratch / result buffer of at least getOffsetSize(elements) bytes.
//
// Returns true when more than one batch was processed; the caller then has to
// run combineWithOffsets(elements, output, offsets) to obtain the final scan.
// Both storage bindings used here are cleared before returning.
bool ScanSystem::scanData( GLuint elements, const Buffer& input, const Buffer& output, const Buffer& offsets )
{
  assert( (elements % 4) == 0 );
  assert( elements < (GLuint64)BATCH_ELEMENTS*BATCH_ELEMENTS*BATCH_ELEMENTS);
  assert( elements * sizeof(GLuint) <= size_t(input.size) );
  assert( input.size <= output.size );

  glUseProgram(programs.prefixsum);
  glUniform1ui(0,elements);

  input.BindBufferRange(GL_SHADER_STORAGE_BUFFER,1);
  output.BindBufferRange(GL_SHADER_STORAGE_BUFFER,0);

  glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

  GLuint groups = snapdiv(elements,BATCH_ELEMENTS);

  assert(groups <= maxGrpsPrefix);
  glDispatchCompute(groups,1,1);

  if (groups > 1){

    // second level: scan the totals of the individual batches
    GLuint groupcombines = snapdiv(groups,BATCH_ELEMENTS);

    assert( groupcombines <= BATCH_ELEMENTS );
    assert( getOffsetSize(elements) <= size_t(offsets.size));

    glUseProgram(programs.offsets);
    glUniform1ui(0,elements);

    output.BindBufferRange(GL_SHADER_STORAGE_BUFFER,  1);
    offsets.BindBufferRange(GL_SHADER_STORAGE_BUFFER, 0);

    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

    assert(groupcombines <= maxGrpsOffsets);
    glDispatchCompute(groupcombines,1,1);

    if (groupcombines > 1){
      // third level: scan the totals of the second level and fold them back
      // into `offsets`, so that the caller only needs a single combine pass
      glUniform1ui(0,groupcombines*BATCH_ELEMENTS);

      Buffer additionaloffsets = offsets; // derive from offsets
      GLintptr required = groupcombines*BATCH_ELEMENTS*sizeof(GLuint);

      // the third-level results live directly behind the second-level ones
      additionaloffsets.offset += required;
      additionaloffsets.size = offsets.size - required;

      offsets.BindBufferRange(GL_SHADER_STORAGE_BUFFER,1);
      additionaloffsets.BindBufferRange(GL_SHADER_STORAGE_BUFFER,0);

      glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

      glDispatchCompute(1,1,1);

      combineWithOffsets(groupcombines*BATCH_ELEMENTS, offsets, additionaloffsets);
    }
  }

  // glBindBufferBase takes (target, index, buffer): clear binding points 0 and 1
  glBindBufferBase(GL_SHADER_STORAGE_BUFFER,0,0);
  glBindBufferBase(GL_SHADER_STORAGE_BUFFER,1,0);

  return groups > 1;
}

// Adds the scanned batch totals in `offsets` to the per-batch prefix sums in
// `output`, turning them into a prefix sum over all `elements` values.
// One invocation handles one element, so the dispatch uses GROUPSIZE-sized
// groups. The storage bindings 0 and 1 are left bound afterwards.
void ScanSystem::combineWithOffsets(GLuint elements, const Buffer& output, const Buffer& offsets )
{
  //assert((elements % 4) == 0);
  assert(elements * sizeof(GLuint) <= size_t(output.size));

  glUseProgram(programs.combine);
  glUniform1ui(0,elements);

  offsets.BindBufferRange(GL_SHADER_STORAGE_BUFFER, 1);
  output.BindBufferRange(GL_SHADER_STORAGE_BUFFER, 0);

  // results of earlier dispatches must be visible to this pass
  glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);

  GLuint groups = snapdiv(elements,GROUPSIZE);
  // the queried limit itself is a valid group count, same check as in scanData
  assert(groups <= maxGrpsCombine);
  glDispatchCompute(groups,1,1);
}

void ScanSystem::init( const Programs& progs )
{
  update(progs);
}

void ScanSystem::update( const Programs& progs )
{
  GLuint    maxGroups[3];
  glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT,0,(GLint*)&maxGroups[0]);

  //GLuint    groupSize[3];
  //glGetProgramiv(progs.combine,    GL_COMPUTE_WORK_GROUP_SIZE, (GLint*)groupSize);
  maxGrpsCombine = maxGroups[0];
  //glGetProgramiv(progs.offsets,    GL_COMPUTE_WORK_GROUP_SIZE, (GLint*)groupSize);
  maxGrpsOffsets = maxGroups[0];
  //glGetProgramiv(progs.prefixsum,    GL_COMPUTE_WORK_GROUP_SIZE, (GLint*)groupSize);
  maxGrpsPrefix = maxGroups[0];

  programs = progs;
}

void ScanSystem::test()
{
  GLuint scanbuffers[3];
  glCreateBuffers(3,scanbuffers);

  GLuint low  = ScanSystem::BATCH_ELEMENTS/2;
  GLuint mid  = ScanSystem::BATCH_ELEMENTS*ScanSystem::BATCH_ELEMENTS;
  GLuint high = ScanSystem::BATCH_ELEMENTS*ScanSystem::BATCH_ELEMENTS*2;
  size_t offsize = ScanSystem::getOffsetSize(high);

  GLuint* data = new GLuint[high];
  for (GLuint i = 0; i < high; i++){
    data[i] = 1;
  }

  glNamedBufferStorage(scanbuffers[0], high * sizeof(GLuint), &data[0], 0 );
  glNamedBufferStorage(scanbuffers[1], high * sizeof(GLuint),0, GL_MAP_READ_BIT );
  glNamedBufferStorage(scanbuffers[2], offsize,0,GL_MAP_READ_BIT);

  delete [] data;

  GLuint result;
  bool needcombine;

  // low
  needcombine = scanData(low, scanbuffers[0], scanbuffers[1], scanbuffers[2]);
  assert(needcombine == false);
  result = 0;
  glGetNamedBufferSubData(scanbuffers[1],sizeof(GLuint) * (low-1), sizeof(GLuint), &result);
  assert(result == low);

  // med
  needcombine = scanData(mid, scanbuffers[0], scanbuffers[1], scanbuffers[2]);
  assert(needcombine == true);
  result = 0;
  glGetNamedBufferSubData(scanbuffers[2],sizeof(GLuint) * (ScanSystem::BATCH_ELEMENTS-1), sizeof(GLuint), &result);
  assert(result == mid);

  combineWithOffsets(mid, scanbuffers[1], scanbuffers[2]);
  result = 0;
  glGetNamedBufferSubData(scanbuffers[1],sizeof(GLuint) * (mid-1), sizeof(GLuint), &result);
  assert(result == mid);

  // high
  needcombine = scanData(high, scanbuffers[0], scanbuffers[1], scanbuffers[2]);
  assert(needcombine == true);
  combineWithOffsets(high, scanbuffers[1], scanbuffers[2]);
  result = 0;
  glGetNamedBufferSubData(scanbuffers[1],sizeof(GLuint) * (high-1), sizeof(GLuint), &result);
  assert(result == high);

  glDeleteBuffers(3,scanbuffers);
}


================================================
FILE: scansystem.hpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#ifndef SCANSYSTEM_H__
#define SCANSYSTEM_H__

#include <nvgl/extensions_gl.hpp>
#include <cstddef>

class ScanSystem {
public:
  const static size_t GROUPSIZE = 512;
  const static size_t BATCH_ELEMENTS = GROUPSIZE*4;

  struct Programs {
    GLuint prefixsum;
    GLuint offsets;
    GLuint combine;
  };

  struct Buffer {
    GLuint      buffer;
    GLintptr    offset;
    GLsizeiptr  size;

    void create(size_t sizei, const void* data, GLbitfield flags)
    {
      size = sizei;
      offset = 0;
      glCreateBuffers(1,&buffer);
      glNamedBufferStorage(buffer, size, data, flags);
    }

    Buffer(GLuint buffer)
      : buffer(buffer)
      , offset(0)
    {
      if (sizeof(GLsizeiptr) > 4)
        glGetNamedBufferParameteri64v(buffer,GL_BUFFER_SIZE, (GLint64*)&size);
      else
        glGetNamedBufferParameteriv(buffer, GL_BUFFER_SIZE, (GLint*)&size);
    }

    Buffer()
      : buffer(0)
      , offset(0)
      , size(0)
    {

    }

    inline void BindBufferRange(GLenum target, GLuint index) const {
      glBindBufferRange(target, index, buffer, offset, size);
    }
    inline void BindBufferRange(GLenum target, GLuint index, GLintptr offseta, GLsizeiptr sizea) const {
      glBindBufferRange(target, index, buffer, offset+offseta, size+sizea);
    }

    inline void GetNamedBufferSubData(void* data){
      glGetNamedBufferSubData(buffer,offset,size,data);
    }

  };

  void init(const Programs& progs);
  void update(const Programs& progs);

  void test();

  // returns true if offsets are needed
  // the offset value needs to be added using the BATCH_ELEMENTS
  bool scanData( GLuint elements, const Buffer& input, const Buffer& output, const Buffer& offsets);
  void combineWithOffsets(GLuint elements, const Buffer& output, const Buffer& offsets);

  static size_t getOffsetSize(GLuint elements);

public:
  Programs    programs;

  GLuint      maxGrpsPrefix;
  GLuint      maxGrpsOffsets;
  GLuint      maxGrpsCombine;
 };

#endif
 

================================================
FILE: scene.frag.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 430
/**/

#extension GL_ARB_shading_language_include : enable
#include "common.h"

// must match cadscene
struct Side {
  vec4 ambient;
  vec4 diffuse;
  vec4 specular;
  vec4 emissive;
};

struct Material {
  Side  sides[2];
  Side  _pad[2];
};

layout(std140,binding=UBO_MATERIAL) uniform materialBuffer {
#if USE_INDEXING
  Material  materials[256];
#else
  Material  materials[1];
#endif
};


// Inputs from the vertex stage; the member list must match the vertex shader's output block.
in Interpolants {
  vec3 wPos;
  vec3 wNormal;
#if USE_INDEXING
  // .y is used as the material index in main()
  flat ivec2 assigns;
#endif
#if !defined(WIREMODE)
  flat int wireMode;
#endif
} IN;


// wireMode comes from the vertex stage unless WIREMODE is defined at
// compile time, in which case that constant is used.
#if !defined(WIREMODE)
int wireMode = IN.wireMode;
#else
int wireMode = WIREMODE;
#endif

layout(location=0,index=0) out vec4 out_Color;

// Lights one material side at the interpolated world position:
// ambient + emissive, plus a diffuse term and a half-vector specular term
// with exponent 16. The normal is flipped for back-facing fragments.
vec4 shade(const Side side)
{
  // eye position taken from the w components of the first three viewMatrixIT entries
  vec3 eyePos   = vec3(scene.viewMatrixIT[0].w,scene.viewMatrixIT[1].w,scene.viewMatrixIT[2].w);

  vec3 toLight  = normalize( scene.wLightPos.xyz - IN.wPos);
  vec3 toEye    = normalize( eyePos - IN.wPos);
  vec3 halfVec  = normalize(toLight + toEye);
  vec3 n        = normalize(IN.wNormal) * (gl_FrontFacing ? 1 : -1);

  float diffuseTerm  = max(dot(n,toLight),0);
  float specularTerm = pow(max(0,dot(n,halfVec)),16);

  vec4 result = side.ambient + side.emissive;
  result += side.diffuse * diffuseTerm;
  result += side.specular * specularTerm;

  return result;
}

// Wire fragments get a brightened flat color derived from the diffuse color
// of side 0; all other fragments are lit with the side selected by facing.
void main()
{
#if USE_INDEXING
  int mi = IN.assigns.y;
#else
  int mi = 0;
#endif

  if (wireMode != 0){
    out_Color = materials[mi].sides[0].diffuse*1.5 + 0.3;
  }
  else {
    out_Color = shade(materials[mi].sides[gl_FrontFacing ? 1 : 0]);
  }
}


================================================
FILE: scene.vert.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 430
/**/

#extension GL_ARB_shading_language_include : enable
#include "common.h"

// gl_BaseInstanceARB is only needed when the indices are packed into the base instance.
#if USE_INDEXING && USE_BASEINSTANCE
#extension GL_ARB_shader_draw_parameters : require
#endif
in layout(location=VERTEX_POS)      vec3 pos;
in layout(location=VERTEX_NORMAL)   vec3 normal;

#if USE_INDEXING
#if USE_BASEINSTANCE
// low 20 bits go to .x, the remaining upper bits to .y
ivec2 assigns = ivec2( gl_BaseInstanceARB & 0xFFFFF, gl_BaseInstanceARB >> 20);
#else
in layout(location=VERTEX_ASSIGNS)  ivec2 assigns;
#endif
// .x selects the matrix; .y is forwarded to the fragment stage via OUT.assigns
#define matrixIndex assigns.x
#endif

// Without a compile-time WIREMODE the wire flag arrives as a vertex attribute.
#if !defined(WIREMODE)
in layout(location=VERTEX_WIREMODE) int wireMode;
#endif

// Outputs to the fragment stage; the member list must match the fragment shader's input block.
out Interpolants {
  vec3 wPos;
  vec3 wNormal;
#if USE_INDEXING
  flat ivec2 assigns;
#endif
#if !defined(WIREMODE)
  flat int wireMode;
#endif
} OUT;


// Transforms position and normal to world space — with the indexed matrices
// when USE_INDEXING or USE_MIX is set, otherwise with the bound object
// matrices — projects the position and forwards the interpolants.
void main()
{
#if USE_INDEXING || USE_MIX
  vec3 worldPos    = (getIndexedMatrix(matrixIndex, NODE_MATRIX_WORLD)   * vec4(pos,1)).xyz;
  vec3 worldNormal = mat3(getIndexedMatrix(matrixIndex, NODE_MATRIX_WORLDIT)) * normal;
#else
  vec3 worldPos    = (object.worldMatrix   * vec4(pos,1)).xyz;
  vec3 worldNormal = mat3(object.worldMatrixIT) * normal;
#endif

  OUT.wPos    = worldPos;
  OUT.wNormal = worldNormal;
#if USE_INDEXING
  OUT.assigns = assigns;
#endif
#if !defined(WIREMODE)
  OUT.wireMode = wireMode;
#endif

  gl_Position = scene.viewProjMatrix * vec4(worldPos,1);
}


================================================
FILE: statesystem.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include "statesystem.hpp"
#include <string.h> // memcmp

//////////////////////////////////////////////////////////////////////////

void StateSystem::ClipDistanceState::applyGL() const
{
  for (GLuint i = 0; i < MAX_CLIPPLANES; i++){
    if (isBitSet(enabled,i))  glEnable  (GL_CLIP_DISTANCE0 + i);
    else                      glDisable (GL_CLIP_DISTANCE0 + i);
  }
}

void StateSystem::ClipDistanceState::getGL()
{
  enabled = 0;
  for (GLuint i = 0; i < MAX_CLIPPLANES; i++){
    setBitState(enabled,i,glIsEnabled(GL_CLIP_DISTANCE0 + i));
  }
}

//////////////////////////////////////////////////////////////////////////

#if STATESYSTEM_USE_DEPRECATED
// Sets the (deprecated) alpha test function and reference value.
void StateSystem::AlphaStateDepr::applyGL() const
{
  glAlphaFunc( mode, refvalue );
}

// Reads the (deprecated) alpha test function and reference value from GL.
void StateSystem::AlphaStateDepr::getGL()
{
  glGetIntegerv( GL_ALPHA_TEST_FUNC, reinterpret_cast<GLint*>(&mode) );
  glGetFloatv  ( GL_ALPHA_TEST_REF,  &refvalue );
}
#endif

//////////////////////////////////////////////////////////////////////////

void StateSystem::StencilState::applyGL() const
{
  glStencilFuncSeparate(GL_FRONT, funcs[FACE_FRONT].func, funcs[FACE_FRONT].refvalue, funcs[FACE_FRONT].mask);
  glStencilFuncSeparate(GL_BACK,  funcs[FACE_BACK ].func, funcs[FACE_BACK ].refvalue, funcs[FACE_BACK ].mask);
  glStencilOpSeparate(GL_FRONT,   ops[FACE_FRONT].fail,   ops[FACE_FRONT].zfail,      ops[FACE_FRONT].zpass);
  glStencilOpSeparate(GL_BACK,    ops[FACE_BACK ].fail,   ops[FACE_BACK ].zfail,      ops[FACE_BACK ].zpass);
}

void StateSystem::StencilState::getGL()
{
  glGetIntegerv(GL_STENCIL_FUNC,        (GLint*)&funcs[FACE_FRONT].func);
  glGetIntegerv(GL_STENCIL_REF,         (GLint*)&funcs[FACE_FRONT].refvalue);
  glGetIntegerv(GL_STENCIL_VALUE_MASK,  (GLint*)&funcs[FACE_FRONT].mask);

  glGetIntegerv(GL_STENCIL_BACK_FUNC,         (GLint*)&funcs[FACE_BACK].func);
  glGetIntegerv(GL_STENCIL_BACK_REF,          (GLint*)&funcs[FACE_BACK].refvalue);
  glGetIntegerv(GL_STENCIL_BACK_VALUE_MASK,   (GLint*)&funcs[FACE_BACK].mask);

  glGetIntegerv(GL_STENCIL_FAIL,              (GLint*)&ops[FACE_FRONT].fail);
  glGetIntegerv(GL_STENCIL_PASS_DEPTH_FAIL,   (GLint*)&ops[FACE_FRONT].zfail);
  glGetIntegerv(GL_STENCIL_PASS_DEPTH_PASS,   (GLint*)&ops[FACE_FRONT].zpass);

  glGetIntegerv(GL_STENCIL_BACK_FAIL,             (GLint*)&ops[FACE_BACK].fail);
  glGetIntegerv(GL_STENCIL_BACK_PASS_DEPTH_FAIL,  (GLint*)&ops[FACE_BACK].zfail);
  glGetIntegerv(GL_STENCIL_BACK_PASS_DEPTH_PASS,  (GLint*)&ops[FACE_BACK].zpass);
}

//////////////////////////////////////////////////////////////////////////

void StateSystem::BlendState::applyGL() const
{
  if (separateEnable){
    for (GLuint i = 0; i < MAX_DRAWBUFFERS; i++){
      if (isBitSet(separateEnable,i)) glEnablei(GL_BLEND,i);
      else                            glDisablei(GL_BLEND,i);
    }
  }

  if (useSeparate){
    for (GLuint i = 0; i < MAX_DRAWBUFFERS; i++){
      glBlendFuncSeparatei(i,blends[i].rgb.srcw,blends[i].rgb.dstw,blends[i].alpha.srcw,blends[i].alpha.dstw);
      glBlendEquationSeparatei(i,blends[i].rgb.equ,blends[i].alpha.equ);
    }
  }
  else{
    glBlendFuncSeparate(blends[0].rgb.srcw,blends[0].rgb.dstw,blends[0].alpha.srcw,blends[0].alpha.dstw);
    glBlendEquationSeparate(blends[0].rgb.equ,blends[0].alpha.equ);
  }

  //glBlendColor(color[0],color[1],color[2],color[3]);
}

// Captures the blend enables and the per-draw-buffer blend setup from GL.
//
// `separateEnable` receives one bit per draw buffer, but is reset to 0 when
// every buffer has blending enabled. `useSeparate` is set when not all draw
// buffers share the same blend functions and equations.
void StateSystem::BlendState::getGL()
{
  GLuint stateSet = 0;
  separateEnable = 0;
  for (GLuint i = 0; i < MAX_DRAWBUFFERS; i++){
    if (setBitState(separateEnable,i, glIsEnabledi( GL_BLEND, i))) stateSet++;
  }
  if (stateSet == MAX_DRAWBUFFERS){
    separateEnable = 0;
  }

  // counts buffer 0 plus every buffer that matches its predecessor
  GLuint numEqual = 1;
  for (GLuint i = 0; i < MAX_DRAWBUFFERS; i++){
    glGetIntegeri_v(GL_BLEND_SRC_RGB,i,(GLint*)&blends[i].rgb.srcw);
    glGetIntegeri_v(GL_BLEND_DST_RGB,i,(GLint*)&blends[i].rgb.dstw);
    glGetIntegeri_v(GL_BLEND_EQUATION_RGB,i,(GLint*)&blends[i].rgb.equ);

    glGetIntegeri_v(GL_BLEND_SRC_ALPHA,i,(GLint*)&blends[i].alpha.srcw);
    glGetIntegeri_v(GL_BLEND_DST_ALPHA,i,(GLint*)&blends[i].alpha.dstw);
    glGetIntegeri_v(GL_BLEND_EQUATION_ALPHA,i,(GLint*)&blends[i].alpha.equ);

    // Start comparing at buffer 1 (vs. buffer 0). Starting at 2 would cap
    // numEqual at MAX_DRAWBUFFERS-1 and force useSeparate on unconditionally.
    if (i > 0 && memcmp(&blends[i].rgb,&blends[i-1].rgb,sizeof(blends[i].rgb))==0 && memcmp(&blends[i].alpha,&blends[i-1].alpha,sizeof(blends[i].alpha))==0){
      numEqual++;
    }
  }

  useSeparate = numEqual != MAX_DRAWBUFFERS;

  //glGetFloatv(GL_BLEND_COLOR,color);
}

//////////////////////////////////////////////////////////////////////////

void StateSystem::DepthState::applyGL() const
{
  glDepthFunc(func);
}

void StateSystem::DepthState::getGL()
{
  glGetIntegerv(GL_DEPTH_FUNC,(GLint*)&func);
}

//////////////////////////////////////////////////////////////////////////

void StateSystem::LogicState::applyGL() const
{
  glLogicOp(op);
}

void StateSystem::LogicState::getGL()
{
  glGetIntegerv(GL_LOGIC_OP_MODE,(GLint*)&op);
}

//////////////////////////////////////////////////////////////////////////

// Applies cull face, polygon mode (for both faces) and the point parameters.
// Front face, polygon offset and line width are intentionally left out here
// (their calls are commented out).
void StateSystem::RasterState::applyGL() const
{
  //glFrontFace(frontFace);
  glCullFace( cullFace );
  //glPolygonOffset(polyOffsetFactor,polyOffsetUnits);
  glPolygonMode( GL_FRONT_AND_BACK, polyMode );
  //glLineWidth(lineWidth);
  glPointSize( pointSize );
  glPointParameterf( GL_POINT_FADE_THRESHOLD_SIZE, pointFade );
  glPointParameteri( GL_POINT_SPRITE_COORD_ORIGIN, pointSpriteOrigin );
}

void StateSystem::RasterState::getGL()
{
  //glGetIntegerv(GL_FRONT_FACE, (GLint*)&frontFace);
  glGetIntegerv(GL_CULL_FACE_MODE, (GLint*)&cullFace);
  //glGetFloatv(GL_POLYGON_OFFSET_FACTOR,&polyOffsetFactor);
  //glGetFloatv(GL_POLYGON_OFFSET_UNITS,&polyOffsetUnits);
  //glGetFloatv(GL_LINE_WIDTH,&lineWidth);
  glGetFloatv(GL_POINT_SIZE,&pointSize);
  glGetFloatv(GL_POINT_FADE_THRESHOLD_SIZE,&pointFade);
  glGetIntegerv(GL_POINT_SPRITE_COORD_ORIGIN,(GLint*)&pointSpriteOrigin);
}

//////////////////////////////////////////////////////////////////////////

#if STATESYSTEM_USE_DEPRECATED
// Applies the (deprecated) line stipple and shade model.
void StateSystem::RasterStateDepr::applyGL() const
{
  glLineStipple( lineStippleFactor, lineStipplePattern );
  glShadeModel( shadeModel );
}

// Reads the (deprecated) line stipple and shade model from GL.
void StateSystem::RasterStateDepr::getGL()
{
  // the pattern is queried through a GLint temporary and then assigned to the member
  GLint stipplePattern;
  glGetIntegerv( GL_LINE_STIPPLE_PATTERN, &stipplePattern );
  lineStipplePattern = stipplePattern;

  glGetIntegerv( GL_LINE_STIPPLE_REPEAT, reinterpret_cast<GLint*>(&lineStippleFactor) );
  glGetIntegerv( GL_SHADE_MODEL,         reinterpret_cast<GLint*>(&shadeModel) );
}
#endif

//////////////////////////////////////////////////////////////////////////

// Applies primitive restart index, provoking vertex convention and the
// number of vertices per patch.
void StateSystem::PrimitiveState::applyGL() const
{
  glPrimitiveRestartIndex( restartIndex );
  glProvokingVertex( provokingVertex );
  glPatchParameteri( GL_PATCH_VERTICES, patchVertices );
}

void StateSystem::PrimitiveState::getGL()
{
  glGetIntegerv(GL_PRIMITIVE_RESTART_INDEX, (GLint*)&restartIndex);
  glGetIntegerv(GL_PROVOKING_VERTEX, (GLint*)&provokingVertex);
  glGetIntegerv(GL_PATCH_VERTICES, (GLint*)&patchVertices);
}

//////////////////////////////////////////////////////////////////////////

// Applies the sample coverage parameters and the first sample mask word.
void StateSystem::SampleState::applyGL() const
{
  glSampleCoverage( coverage, invert );
  glSampleMaski( 0, mask );
}

void StateSystem::SampleState::getGL()
{
  glGetIntegerv(GL_SAMPLE_COVERAGE_VALUE,(GLint*)&coverage);
  glGetIntegerv(GL_SAMPLE_COVERAGE_INVERT,(GLint*)&invert);
  glGetIntegeri_v(GL_SAMPLE_MASK_VALUE,0,(GLint*)&mask);
}

//////////////////////////////////////////////////////////////////////////
/*
void StateSystem::ViewportState::applyGL() const
{
  if (useSeparate){
    glViewportArrayv(0,MAX_VIEWPORTS, &viewports[0].x);
  }
  else{
    glViewport(GLint(viewports[0].x),GLint(viewports[0].y),GLsizei(viewports[0].width),GLsizei(viewports[0].height));
  }
}

void StateSystem::ViewportState::getGL()
{
  int numEqual = 1;
  for (GLuint i = 0; i < MAX_VIEWPORTS; i++){
    glGetFloati_v(GL_VIEWPORT,i,&viewports[i].x);
    if (i > 0 && memcmp(&viewports[i],&viewports[i-1],sizeof(viewports[i]))==0){
      numEqual++;
    }
  }
  
  useSeparate = (numEqual != MAX_VIEWPORTS);
}
*/
//////////////////////////////////////////////////////////////////////////

// Applies the depth ranges: all MAX_VIEWPORTS entries when `useSeparate` is
// set, otherwise only depths[0] through the non-indexed call.
void StateSystem::DepthRangeState::applyGL() const
{
  if (!useSeparate){
    glDepthRange( depths[0].nearPlane, depths[0].farPlane );
    return;
  }

  // the array call reads the ranges starting at the first nearPlane
  glDepthRangeArrayv( 0, MAX_VIEWPORTS, &depths[0].nearPlane );
}

void StateSystem::DepthRangeState::getGL()
{
  GLuint numEqual = 1;
  for (GLuint i = 0; i < MAX_VIEWPORTS; i++){
    glGetDoublei_v(GL_DEPTH_RANGE,i,&depths[i].nearPlane);
    if (i > 0 && memcmp(&depths[i],&depths[i-1],sizeof(depths[i]))==0){
      numEqual++;
    }
  }

  useSeparate = (numEqual != MAX_VIEWPORTS);
}

//////////////////////////////////////////////////////////////////////////
/*
void StateSystem::ScissorState::applyGL() const
{
  if (useSeparate){
    glScissorArrayv(0,MAX_VIEWPORTS, &scissor[0].x);
  }
  else{
    glScissor(scissor[0].x,scissor[0].y,scissor[0].width,scissor[0].height);
  }
}

void StateSystem::ScissorState::getGL()
{
  GLuint numEqual = 1;
  for (GLuint i = 0; i < MAX_VIEWPORTS; i++){
    glGetIntegeri_v(GL_SCISSOR_BOX,i,&scissor[i].x);
    if (i > 0 && memcmp(&scissor[i],&scissor[i-1],sizeof(scissor[i]))==0){
      numEqual++;
    }
  }

  useSeparate = (numEqual != MAX_VIEWPORTS);
}
*/
//////////////////////////////////////////////////////////////////////////

void StateSystem::ScissorEnableState::applyGL() const
{
  if (separateEnable){
    for (GLuint i = 0; i < MAX_VIEWPORTS; i++){
      if (isBitSet(separateEnable,i))  glEnablei (GL_SCISSOR_TEST,i);
      else                                    glDisablei(GL_SCISSOR_TEST,i);
    }
  }

}

void StateSystem::ScissorEnableState::getGL()
{
  GLuint stateSet = 0;
  separateEnable = 0;
  for (GLuint i = 0; i < MAX_DRAWBUFFERS; i++){
    if (setBitState(separateEnable,i, glIsEnabledi( GL_BLEND, i))) stateSet++;
  }
  if (stateSet == MAX_DRAWBUFFERS){
    separateEnable = 0;
  }
}

//////////////////////////////////////////////////////////////////////////

void StateSystem::MaskState::applyGL() const
{
  if (colormaskUseSeparate){
    for (GLuint i = 0; i < MAX_DRAWBUFFERS; i++){
      glColorMaski(i, colormask[i][0],colormask[i][1],colormask[i][2],colormask[i][3]);
    }
  }
  else{
    glColorMask( colormask[0][0],colormask[0][1],colormask[0][2],colormask[0][3] );
  }
  glDepthMask(depth);
  glStencilMaskSeparate(GL_FRONT, stencil[FACE_FRONT]);
  glStencilMaskSeparate(GL_BACK,  stencil[FACE_BACK]);
}

void StateSystem::MaskState::getGL()
{
  glGetBooleanv(GL_DEPTH_WRITEMASK,&depth);
  glGetIntegerv(GL_STENCIL_WRITEMASK, (GLint*)&stencil[FACE_FRONT]);
  glGetIntegerv(GL_STENCIL_BACK_WRITEMASK, (GLint*)&stencil[FACE_BACK]);

  int numEqual = 1;
  for (GLuint i = 0; i < MAX_DRAWBUFFERS; i++){
    glGetBooleani_v(GL_COLOR_WRITEMASK, i, colormask[i]);

    if ( i > 0 && memcmp(colormask[i],colormask[i-1],sizeof(colormask[i]))==0){
      numEqual++;
    }
  }

  colormaskUseSeparate = numEqual != MAX_DRAWBUFFERS;
}

//////////////////////////////////////////////////////////////////////////

void StateSystem::FBOState::applyGL(bool skipFboBinding) const
{
  if (!skipFboBinding){
    glBindFramebuffer(GL_DRAW_FRAMEBUFFER,fboDraw);
    glBindFramebuffer(GL_READ_FRAMEBUFFER,fboRead);
  }
  glDrawBuffers(numBuffers,drawBuffers);
  glReadBuffer(readBuffer);
}

void StateSystem::FBOState::getGL()
{
  glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING,(GLint*)&fboDraw);
  glGetIntegerv(GL_READ_FRAMEBUFFER_BINDING,(GLint*)&fboRead);

  glGetIntegerv(GL_READ_BUFFER,(GLint*)&readBuffer);

  for (int i = 0; i < MAX_DRAWBUFFERS; i++){
    glGetIntegerv(GL_DRAW_BUFFER0 + i,(GLint*)&drawBuffers[i]);
    if (drawBuffers[i] != GL_NONE){
      numBuffers = i+1;
    }
  }
}

//////////////////////////////////////////////////////////////////////////

// Enables or disables the vertex attribute arrays whose bit is set in
// `changed`, according to the matching bit in `enabled`.
void StateSystem::VertexEnableState::applyGL(GLbitfield changed) const
{
  for (GLuint attrib = 0; attrib < MAX_VERTEXATTRIBS; ++attrib){
    if (!isBitSet(changed, attrib)) continue;

    if (isBitSet(enabled, attrib)){
      glEnableVertexAttribArray(attrib);
    }
    else{
      glDisableVertexAttribArray(attrib);
    }
  }
}

// Rebuilds `enabled` from the enable flag of every vertex attribute array.
void StateSystem::VertexEnableState::getGL()
{
  enabled = 0;
  for (GLuint i = 0; i < MAX_VERTEXATTRIBS; i++){
    // initialized so that a failed query reads as "disabled" instead of garbage
    GLint status = 0;
    glGetVertexAttribiv(i,GL_VERTEX_ATTRIB_ARRAY_ENABLED, &status);
    setBitState(enabled,i, status);
  }
}

//////////////////////////////////////////////////////////////////////////

// Applies the attribute formats flagged in `changedFormat` and the binding
// setups flagged in `changedBinding`.
void StateSystem::VertexFormatState::applyGL(GLbitfield changedFormat, GLbitfield changedBinding) const
{
  for (GLuint attrib = 0; attrib < MAX_VERTEXATTRIBS; ++attrib){
    if (isBitSet(changedFormat, attrib)){
      switch(formats[attrib].mode){
      case VERTEXMODE_INT:
      case VERTEXMODE_UINT:
        // integer attributes use the I-format entry point
        glVertexAttribIFormat(attrib, formats[attrib].size, formats[attrib].type, formats[attrib].relativeoffset);
        break;
      case VERTEXMODE_FLOAT:
        glVertexAttribFormat(attrib, formats[attrib].size, formats[attrib].type, formats[attrib].normalized, formats[attrib].relativeoffset);
        break;
      }
      glVertexAttribBinding(attrib, formats[attrib].binding);
    }
  }

  for (GLuint binding = 0; binding < MAX_VERTEXBINDINGS; ++binding){
    if (isBitSet(changedBinding, binding)){
      glVertexBindingDivisor(binding, bindings[binding].divisor);
      // binds buffer 0 at offset 0; only the stride comes from this state
      glBindVertexBuffer(binding, 0, 0, bindings[binding].stride);
    }
  }
}

void StateSystem::VertexFormatState::getGL()
{
  for (GLuint i = 0; i < MAX_VERTEXATTRIBS; i++){
    GLint status = 0;
    glGetVertexAttribiv(i,GL_VERTEX_ATTRIB_RELATIVE_OFFSET, (GLint*)&formats[i].relativeoffset);
    glGetVertexAttribiv(i,GL_VERTEX_ATTRIB_ARRAY_SIZE, (GLint*)&formats[i].size);
    glGetVertexAttribiv(i,GL_VERTEX_ATTRIB_ARRAY_TYPE, (GLint*)&formats[i].type);
    glGetVertexAttribiv(i,GL_VERTEX_ATTRIB_ARRAY_NORMALIZED, (GLint*)&status);
    formats[i].normalized = status;
    glGetVertexAttribiv(i,GL_VERTEX_ATTRIB_ARRAY_INTEGER, (GLint*)&status);
    if (status){
      formats[i].mode = VERTEXMODE_INT;
    }
    else{
      formats[i].mode = VERTEXMODE_FLOAT;
    }
    glGetVertexAttribiv(i,GL_VERTEX_ATTRIB_BINDING, (GLint*)&formats[i].binding);
  }

  for (GLuint i = 0; i < MAX_VERTEXBINDINGS; i++){
    glGetIntegeri_v(GL_VERTEX_BINDING_DIVISOR,i,(GLint*)&bindings[i].divisor);
    glGetIntegeri_v(GL_VERTEX_BINDING_STRIDE, i,(GLint*)&bindings[i].stride);
  }
}

//////////////////////////////////////////////////////////////////////////

// Sets the current (immediate) value of every vertex attribute flagged in
// `changed`, using the entry point that matches the stored mode.
void StateSystem::VertexImmediateState::applyGL(GLbitfield changed) const
{
  for (GLuint attrib = 0; attrib < MAX_VERTEXATTRIBS; ++attrib){
    if (isBitSet(changed, attrib)){
      switch(data[attrib].mode){
      case VERTEXMODE_UINT:
        glVertexAttribI4uiv(attrib, data[attrib].uints);
        break;
      case VERTEXMODE_INT:
        glVertexAttribI4iv(attrib, data[attrib].ints);
        break;
      case VERTEXMODE_FLOAT:
        glVertexAttrib4fv(attrib, data[attrib].floats);
        break;
      }
    }
  }
}

void StateSystem::VertexImmediateState::getGL()
{
  for (GLuint i = 0; i < MAX_VERTEXATTRIBS; i++){
    switch(data[i].mode){
    case VERTEXMODE_FLOAT:
      glGetVertexAttribfv(i,GL_CURRENT_VERTEX_ATTRIB,data[i].floats);
      break;
    case VERTEXMODE_INT:
      glGetVertexAttribIiv(i,GL_CURRENT_VERTEX_ATTRIB,data[i].ints);
      break;
    case VERTEXMODE_UINT:
      glGetVertexAttribIuiv(i,GL_CURRENT_VERTEX_ATTRIB,data[i].uints);
      break;
    }
  }
}

//////////////////////////////////////////////////////////////////////////

void StateSystem::ProgramState::applyGL() const
{
  glUseProgram(program);
}

void StateSystem::ProgramState::getGL()
{
  glGetIntegerv(GL_CURRENT_PROGRAM, (GLint*)&program);
}

//////////////////////////////////////////////////////////////////////////

// keep in sync!
// GL capability for each StateSystem::StateBits value; entry i is the enum
// passed to glEnable/glDisable/glIsEnabled for bit i of EnableState::stateBits,
// so the order here must match the order of the StateBits enumerators.
static GLenum s_stateEnums[StateSystem::NUM_STATEBITS] = {
  GL_BLEND,
  GL_COLOR_LOGIC_OP,
  GL_CULL_FACE,
  GL_DEPTH_CLAMP,
  GL_DEPTH_TEST,
  GL_DITHER,
  GL_FRAMEBUFFER_SRGB,
  GL_LINE_SMOOTH,
  GL_MULTISAMPLE,
  GL_POLYGON_OFFSET_FILL,
  GL_POLYGON_OFFSET_LINE,
  GL_POLYGON_OFFSET_POINT,
  GL_POLYGON_SMOOTH,
  GL_PRIMITIVE_RESTART,
  GL_PRIMITIVE_RESTART_FIXED_INDEX,
  GL_RASTERIZER_DISCARD,
  GL_SAMPLE_ALPHA_TO_COVERAGE,
  GL_SAMPLE_ALPHA_TO_ONE,
  GL_SAMPLE_COVERAGE,
  GL_SAMPLE_SHADING,
  GL_SAMPLE_MASK,
  GL_STENCIL_TEST,
  GL_SCISSOR_TEST,
  GL_TEXTURE_CUBE_MAP_SEAMLESS,
  GL_PROGRAM_POINT_SIZE,
};

// Enables or disables every capability whose bit is set in `changedBits`,
// according to the matching bit in `stateBits`.
void StateSystem::EnableState::applyGL(GLbitfield changedBits) const
{
  for (GLuint bit = 0; bit < NUM_STATEBITS; ++bit){
    if (!isBitSet(changedBits, bit)) continue;

    const GLenum cap = s_stateEnums[bit];
    if (isBitSet(stateBits, bit)){
      glEnable(cap);
    }
    else{
      glDisable(cap);
    }
  }
}

void StateSystem::EnableState::getGL()
{
  for (GLuint i = 0; i < NUM_STATEBITS; i++){
    setBitState(stateBits,i, glIsEnabled(s_stateEnums[i]));
  }
}

//////////////////////////////////////////////////////////////////////////
#if STATESYSTEM_USE_DEPRECATED
// GL capability for each bit of EnableStateDepr::stateBitsDepr.
static GLenum s_stateEnumsDepr[StateSystem::NUM_STATEBITSDEPR] = {
  GL_ALPHA_TEST,
  GL_LINE_STIPPLE,
  GL_POINT_SMOOTH,
  GL_POINT_SPRITE,
  GL_POLYGON_STIPPLE,
};

// Enables or disables every deprecated capability whose bit is set in
// `changedBits`, according to the matching bit in `stateBitsDepr`.
void StateSystem::EnableStateDepr::applyGL(GLbitfield changedBits) const
{
  for (GLuint bit = 0; bit < NUM_STATEBITSDEPR; ++bit){
    if (!isBitSet(changedBits, bit)) continue;

    const GLenum cap = s_stateEnumsDepr[bit];
    if (isBitSet(stateBitsDepr, bit)){
      glEnable(cap);
    }
    else{
      glDisable(cap);
    }
  }
}

// Updates every bit of `stateBitsDepr` from the enable flag of its capability.
void StateSystem::EnableStateDepr::getGL()
{
  GLuint bit = 0;
  while (bit < NUM_STATEBITSDEPR){
    const GLboolean isOn = glIsEnabled(s_stateEnumsDepr[bit]);
    setBitState(stateBitsDepr, bit, isOn);
    ++bit;
  }
}
#endif

//////////////////////////////////////////////////////////////////////////

// Applies the complete state to the GL context, one sub-state after another.
// coreonly       - when set, the deprecated sub-states (only compiled with
//                  STATESYSTEM_USE_DEPRECATED) are skipped
// skipFboBinding - forwarded to FBOState::applyGL, which then leaves the
//                  framebuffer bindings untouched
void StateSystem::State::applyGL(bool coreonly, bool skipFboBinding) const
{
  enable.applyGL();
#if STATESYSTEM_USE_DEPRECATED
  if (!coreonly) enableDepr.applyGL();
#endif
  program.applyGL();
  clip.applyGL();
#if STATESYSTEM_USE_DEPRECATED
  if (!coreonly) alpha.applyGL();
#endif
  blend.applyGL();
  depth.applyGL();
  stencil.applyGL();
  logic.applyGL();
  primitive.applyGL();
  sample.applyGL();
  raster.applyGL();
#if STATESYSTEM_USE_DEPRECATED
  if (!coreonly) rasterDepr.applyGL();
#endif
  // viewport and scissor rectangles are currently not part of the applied state
  /*if (!isBitSet(dynamicState,DYNAMIC_VIEWPORT)){
    viewport.applyGL();
  }*/
  depthrange.applyGL();
  /*if (!isBitSet(dynamicState,DYNAMIC_SCISSOR)){
    scissor.applyGL();
  }*/
  scissorenable.applyGL();
  mask.applyGL();
  fbo.applyGL(skipFboBinding);
  vertexenable.applyGL();
  vertexformat.applyGL();
  verteximm.applyGL();
}

// Captures the complete state from the GL context, one sub-state after another.
// coreonly - when set, the deprecated sub-states (only compiled with
//            STATESYSTEM_USE_DEPRECATED) are not queried
void StateSystem::State::getGL(bool coreonly)
{
  enable.getGL();
#if STATESYSTEM_USE_DEPRECATED
  // deprecated sub-states must be queried here, not applied
  if (!coreonly) enableDepr.getGL();
#endif
  program.getGL();
  clip.getGL();
#if STATESYSTEM_USE_DEPRECATED
  if (!coreonly) alpha.getGL();
#endif
  blend.getGL();
  depth.getGL();
  stencil.getGL();
  logic.getGL();
  primitive.getGL();
  sample.getGL();
  raster.getGL();
#if STATESYSTEM_USE_DEPRECATED
  if (!coreonly) rasterDepr.getGL();
#endif
  //viewport.getGL();
  depthrange.getGL();
  //scissor.getGL();
  scissorenable.getGL();
  mask.getGL();
  fbo.getGL();
  vertexenable.getGL();
  vertexformat.getGL();
  verteximm.getGL();
}


//////////////////////////////////////////////////////////////////////////

void StateSystem::init(bool coreonly)
{
  m_coreonly = coreonly;
}

// Drops all stored states and all recycled IDs.
void StateSystem::deinit()
{
  m_states.clear();
  m_freeIDs.clear();
}

// Hands out `num` state IDs into `objects`.
// IDs previously returned through destroy() are reused first (most recently
// freed first); the remainder are new IDs for which m_states is grown.
void StateSystem::generate( GLuint num, StateID* objects )
{
  // recycle as many IDs as the free list provides
  GLuint reused = 0;
  for ( ; reused < num && !m_freeIDs.empty(); reused++){
    objects[reused] = m_freeIDs.back();
    m_freeIDs.pop_back();
  }

  if (reused == num){
    return;
  }

  // Append the missing states. The new IDs start at the old size of m_states
  // and must be counted from zero, independent of how many IDs were recycled;
  // otherwise they would point past the end of m_states.
  const GLuint begin = GLuint(m_states.size());
  const GLuint fresh = num - reused;
  m_states.resize( begin + fresh );

  for (GLuint n = 0; n < fresh; n++){
    objects[reused + n] = begin + n;
  }
}

// Puts the given IDs on the free list so that generate() can hand them out again.
void StateSystem::destroy( GLuint num, const StateID* objects )
{
  GLuint remaining = num;
  const StateID* current = objects;
  while (remaining != 0){
    m_freeIDs.push_back(*current);
    ++current;
    --remaining;
  }
}

// Stores `state` (with the given base primitive mode) under `id`, bumps the
// entry's change counter and invalidates its cached transition diffs.
void StateSystem::set( StateID id, const State& state, GLenum basePrimitiveMode )
{
  StateInternal& entry = m_states[id];

  entry.changeID++;
  entry.state = state;
  entry.state.basePrimitiveMode = basePrimitiveMode;

  // reset the diff cache of this entry
  entry.usedDiff = 0;
  for (int slot = 0; slot < MAX_DIFFS; ++slot){
    entry.others[slot].state = INVALID_ID;
  }
}

// Returns the state stored under `id`.
const StateSystem::State& StateSystem::get( StateID id ) const
{
  const StateInternal& entry = m_states[id];
  return entry.state;
}

// Returns the index into to.diffs that holds the diff for the transition
// from state `prev` to `to`. A cached diff is reused when it was built for
// `prev` at its current changeID; otherwise the slot at to.usedDiff is
// overwritten with a new diff and usedDiff advances round-robin.
int inline StateSystem::prepareTransitionCache(StateID prev, StateInternal& to )
{
  StateInternal& from = m_states[prev];

  // look for a cached diff that is still valid
  for (int slot = 0; slot < MAX_DIFFS; ++slot){
    const bool sameState  = to.others[slot].state == prev;
    if (sameState && to.others[slot].changeID == from.changeID){
      return slot;
    }
  }

  // none found: claim the next slot and rebuild the diff
  const int slot = to.usedDiff;
  to.usedDiff = (to.usedDiff + 1) % MAX_DIFFS;

  to.others[slot].state    = prev;
  to.others[slot].changeID = from.changeID;

  makeDiff(to.diffs[slot], from, to);

  return slot;
}

// Applies the complete state stored under `id`.
void StateSystem::applyGL( StateID id, bool skipFboBinding ) const
{
  const State& full = m_states[id].state;
  full.applyGL( m_coreonly, skipFboBinding );
}

// Applies state `id`, assuming state `prev` is currently active: only the
// cached difference between the two is applied. With prev == INVALID_ID the
// complete state is applied instead.
void StateSystem::applyGL( StateID id, StateID prev, bool skipFboBinding )
{
  if (prev == INVALID_ID){
    applyGL(id, skipFboBinding);
    return;
  }

  StateInternal& target = m_states[id];
  const int slot = prepareTransitionCache(prev, target);
  applyDiffGL( target.diffs[slot], target.state, skipFboBinding );
}

// Applies only those sub-states of `state` whose bit is set in
// diff.changedContentBits. The enable and vertex sub-states additionally
// receive their own change masks so that only the changed bits/slots are touched.
// Deprecated sub-states are skipped when m_coreonly is set.
// NOTE(review): unlike State::applyGL, the sample sub-state is never applied
// here and makeDiff() does not compare it - confirm whether that is intended.
void StateSystem::applyDiffGL( const StateDiff& diff, const State &state, bool skipFboBinding )
{
  if (isBitSet(diff.changedContentBits,StateDiff::ENABLE))
    state.enable.applyGL(diff.changedStateBits);
#if STATESYSTEM_USE_DEPRECATED
  if (!m_coreonly && isBitSet(diff.changedContentBits,StateDiff::ENABLE_DEPR))
    state.enableDepr.applyGL(diff.changedStateDeprBits);
#endif
  if (isBitSet(diff.changedContentBits,StateDiff::PROGRAM))
    state.program.applyGL();
  if (isBitSet(diff.changedContentBits,StateDiff::CLIP))
    state.clip.applyGL();
#if STATESYSTEM_USE_DEPRECATED
  if (!m_coreonly && isBitSet(diff.changedContentBits,StateDiff::ALPHA_DEPR))
    state.alpha.applyGL();
#endif
  if (isBitSet(diff.changedContentBits,StateDiff::BLEND))
    state.blend.applyGL();
  if (isBitSet(diff.changedContentBits,StateDiff::DEPTH))
    state.depth.applyGL();
  if (isBitSet(diff.changedContentBits,StateDiff::STENCIL))
    state.stencil.applyGL();
  if (isBitSet(diff.changedContentBits,StateDiff::LOGIC))
    state.logic.applyGL();
  if (isBitSet(diff.changedContentBits,StateDiff::PRIMITIVE))
    state.primitive.applyGL();
  if (isBitSet(diff.changedContentBits,StateDiff::RASTER))
    state.raster.applyGL();
#if STATESYSTEM_USE_DEPRECATED
  if (!m_coreonly && isBitSet(diff.changedContentBits,StateDiff::RASTER_DEPR))
    state.rasterDepr.applyGL();
#endif
  /*if (isBitSet(diff.changedContentBits,StateDiff::VIEWPORT))
    state.viewport.applyGL();*/
  if (isBitSet(diff.changedContentBits,StateDiff::DEPTHRANGE))
    state.depthrange.applyGL();
  /*if (isBitSet(diff.changedContentBits,StateDiff::SCISSOR))
    state.scissor.applyGL();*/
  if (isBitSet(diff.changedContentBits,StateDiff::SCISSORENABLE))
    state.scissorenable.applyGL();
  if (isBitSet(diff.changedContentBits,StateDiff::MASK))
    state.mask.applyGL();
  if (isBitSet(diff.changedContentBits,StateDiff::FBO))
    state.fbo.applyGL(skipFboBinding);
  // the vertex sub-states pass per-attribute / per-binding change masks
  if (isBitSet(diff.changedContentBits,StateDiff::VERTEXENABLE))
    state.vertexenable.applyGL(diff.changedVertexEnable);
  if (isBitSet(diff.changedContentBits,StateDiff::VERTEXFORMAT))
    state.vertexformat.applyGL(diff.changedVertexFormat, diff.changedVertexBinding);
  if (isBitSet(diff.changedContentBits,StateDiff::VERTEXIMMEDIATE))
    state.verteximm.applyGL(diff.changedVertexImm);
}


// Computes what has to be re-applied when switching from `fromInternal` to
// `toInternal`.
// diff.changedContentBits gets one StateDiff bit per sub-state whose bytes
// differ (memcmp). The enable and vertex sub-states additionally get
// fine-grained masks (per capability bit, per attribute, per binding).
// NOTE(review): the sample sub-state is not compared here, so a change in it
// is not picked up by applyDiffGL() - confirm whether that is intended.
void StateSystem::makeDiff( StateDiff& diff, const StateInternal &fromInternal, const StateInternal &toInternal )
{
  const State &from = fromInternal.state;
  const State &to   = toInternal.state;

  // per-capability change masks
  diff.changedStateBits     = from.enable.stateBits ^ to.enable.stateBits;
#if STATESYSTEM_USE_DEPRECATED
  diff.changedStateDeprBits = from.enableDepr.stateBitsDepr ^ to.enableDepr.stateBitsDepr;
#endif
  diff.changedContentBits   = 0;
  
  if (memcmp(&from.enable         ,&to.enable         ,sizeof(from.enable         )) != 0) setBit(diff.changedContentBits,StateDiff::ENABLE);
#if STATESYSTEM_USE_DEPRECATED
  if (memcmp(&from.enableDepr     ,&to.enableDepr     ,sizeof(from.enableDepr     )) != 0) setBit(diff.changedContentBits,StateDiff::ENABLE_DEPR);
#endif
  if (memcmp(&from.program        ,&to.program        ,sizeof(from.program        )) != 0) setBit(diff.changedContentBits,StateDiff::PROGRAM);
  if (memcmp(&from.clip           ,&to.clip           ,sizeof(from.clip           )) != 0) setBit(diff.changedContentBits,StateDiff::CLIP);
#if STATESYSTEM_USE_DEPRECATED
  if (memcmp(&from.alpha          ,&to.alpha          ,sizeof(from.alpha          )) != 0) setBit(diff.changedContentBits,StateDiff::ALPHA_DEPR);
#endif
  if (memcmp(&from.blend          ,&to.blend          ,sizeof(from.blend          )) != 0) setBit(diff.changedContentBits,StateDiff::BLEND);
  if (memcmp(&from.depth          ,&to.depth          ,sizeof(from.depth          )) != 0) setBit(diff.changedContentBits,StateDiff::DEPTH);
  if (memcmp(&from.stencil        ,&to.stencil        ,sizeof(from.stencil        )) != 0) setBit(diff.changedContentBits,StateDiff::STENCIL);
  if (memcmp(&from.logic          ,&to.logic          ,sizeof(from.logic          )) != 0) setBit(diff.changedContentBits,StateDiff::LOGIC);
  if (memcmp(&from.primitive      ,&to.primitive      ,sizeof(from.primitive      )) != 0) setBit(diff.changedContentBits,StateDiff::PRIMITIVE);
  if (memcmp(&from.raster         ,&to.raster         ,sizeof(from.raster         )) != 0) setBit(diff.changedContentBits,StateDiff::RASTER);
#if STATESYSTEM_USE_DEPRECATED
  if (memcmp(&from.rasterDepr     ,&to.rasterDepr     ,sizeof(from.rasterDepr     )) != 0) setBit(diff.changedContentBits,StateDiff::RASTER_DEPR);
#endif
  //if (memcmp(&from.viewport       ,&to.viewport       ,sizeof(from.viewport       )) != 0) setBit(diff.changedContentBits,StateDiff::VIEWPORT);
  // DEPTHRANGE must be derived from the depthrange sub-state (not from depth),
  // otherwise depth range changes are missed and depth func changes re-apply it
  if (memcmp(&from.depthrange     ,&to.depthrange     ,sizeof(from.depthrange     )) != 0) setBit(diff.changedContentBits,StateDiff::DEPTHRANGE);
  //if (memcmp(&from.scissor        ,&to.scissor        ,sizeof(from.scissor        )) != 0) setBit(diff.changedContentBits,StateDiff::SCISSOR);
  if (memcmp(&from.scissorenable  ,&to.scissorenable  ,sizeof(from.scissorenable  )) != 0) setBit(diff.changedContentBits,StateDiff::SCISSORENABLE);
  if (memcmp(&from.mask           ,&to.mask           ,sizeof(from.mask           )) != 0) setBit(diff.changedContentBits,StateDiff::MASK);
  if (memcmp(&from.fbo            ,&to.fbo            ,sizeof(from.fbo            )) != 0) setBit(diff.changedContentBits,StateDiff::FBO);

  // special case vertex stuff, more likely to change than the rest

  diff.changedVertexEnable  = from.vertexenable.enabled ^ to.vertexenable.enabled;

  diff.changedVertexImm = 0;
  diff.changedVertexFormat = 0;
  
  for (GLint i = 0; i < MAX_VERTEXATTRIBS; i++){
    if (memcmp(&from.vertexformat.formats[i], &to.vertexformat.formats[i], sizeof(to.vertexformat.formats[i])) != 0)  setBit(diff.changedVertexFormat,i);
    if (memcmp(&from.verteximm.data[i], &to.verteximm.data[i], sizeof(to.verteximm.data[i])) != 0)                    setBit(diff.changedVertexImm,i);
  }

  diff.changedVertexBinding = 0;
  for (GLint i = 0; i < MAX_VERTEXBINDINGS; i++){
    if (memcmp(&from.vertexformat.bindings[i], &to.vertexformat.bindings[i], sizeof(to.vertexformat.bindings[i])) != 0)  setBit(diff.changedVertexBinding,i);
  }

  // fold the fine-grained vertex masks into the content bits
  if (diff.changedVertexEnable)                               setBit(diff.changedContentBits,StateDiff::VERTEXENABLE);
  if (diff.changedVertexBinding || diff.changedVertexFormat)  setBit(diff.changedContentBits,StateDiff::VERTEXFORMAT);
  if (diff.changedVertexImm)                                  setBit(diff.changedContentBits,StateDiff::VERTEXIMMEDIATE);
}

// Makes sure the diff for the transition from `prev` to `id` is cached,
// without applying anything.
void StateSystem::prepareTransition( StateID id, StateID prev )
{
  prepareTransitionCache(prev, m_states[id]);
}


================================================
FILE: statesystem.hpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */


#ifndef STATESYSTEM_H__
#define STATESYSTEM_H__


#include <nvgl/extensions_gl.hpp>
#include <vector>

class StateSystem {
public:

  // Returns true when bit number `key` is set in `bits`.
  static inline bool isBitSet(GLbitfield bits, GLuint key)
  {
    // shift an unsigned one so that the top bit does not overflow a signed int
    return (bits & (GLbitfield(1) << key)) != 0;
  }

  // Sets bit number `key` in `bits`.
  static inline void setBit(GLbitfield& bits, GLuint key)
  {
    // shift an unsigned one so that the top bit does not overflow a signed int
    bits |= (GLbitfield(1) << key);
  }

  // Returns a mask with only bit number `key` set.
  static GLbitfield getBit(GLuint key)
  {
    // shift an unsigned one so that the top bit does not overflow a signed int
    return (GLbitfield(1) << key);
  }

  // Sets bit number `key` in `bits` when `state` is non-zero, clears it
  // otherwise. Returns `state` unchanged.
  static inline GLboolean setBitState(GLbitfield& bits, GLuint key, GLboolean state)
  {
    // shift an unsigned one so that the top bit does not overflow a signed int
    const GLbitfield mask = GLbitfield(1) << key;
    if (state)  bits |=  mask;
    else        bits &= ~mask;
    return state;
  }
  
  // Fixed array sizes / loop bounds of the state structs in this class.
  static const GLuint MAX_DRAWBUFFERS    = 8;
  static const GLuint MAX_CLIPPLANES     = 8;
  static const GLuint MAX_VIEWPORTS      = 16;
  static const GLuint MAX_VERTEXATTRIBS  = 16;
  static const GLuint MAX_VERTEXBINDINGS = 16;
  static const GLuint MAX_COLORS         = 4;
    
  // Bit positions of the GL capabilities tracked in EnableState::stateBits.
  // The order must match the s_stateEnums table in statesystem.cpp, which maps
  // each value to its GL enum.
  enum StateBits {
    BLEND,
    COLOR_LOGIC_OP,
    CULL_FACE,
    DEPTH_CLAMP,
    DEPTH_TEST,
    DITHER,
    FRAMEBUFFER_SRGB,
    LINE_SMOOTH,
    MULTISAMPLE,
    POLYGON_OFFSET_FILL,
    POLYGON_OFFSET_LINE,
    POLYGON_OFFSET_POINT,
    POLYGON_SMOOTH,
    PRIMITIVE_RESTART,
    PRIMITIVE_RESTART_FIXED_INDEX,
    RASTERIZER_DISCARD,
    SAMPLE_ALPHA_TO_COVERAGE,
    SAMPLE_ALPHA_TO_ONE,
    SAMPLE_COVERAGE,
    SAMPLE_SHADING,
    SAMPLE_MASK,
    STENCIL_TEST,
    SCISSOR_TEST,
    TEXTURE_CUBE_MAP_SEAMLESS,
    PROGRAM_POINT_SIZE,
    NUM_STATEBITS,  // number of entries, not a capability
  };
#if STATESYSTEM_USE_DEPRECATED
  // Bit positions of the deprecated GL capabilities tracked in
  // EnableStateDepr::stateBitsDepr. The order must match the s_stateEnumsDepr
  // table in statesystem.cpp.
  enum StateBitsDepr {
    DEPR_ALPHA_TEST,
    DEPR_LINE_STIPPLE,
    DEPR_POINT_SMOOTH,
    DEPR_POINT_SPRITE,
    DEPR_POLYGON_STIPPLE,
    NUM_STATEBITSDEPR,  // number of entries, not a capability
  };
#endif
    
  // Array indices for per-face state (stencil functions, ops and masks).
  enum Faces {
    FACE_FRONT,
    FACE_BACK,
    MAX_FACES,  // number of faces, used as array size
  };

  //////////////////////////////////////////////////////////////////////////

  struct ClipDistanceState {
    GLbitfield  enabled;

    ClipDistanceState()
    {
      enabled = 0;
    }

    void applyGL() const;
    void getGL();
  };

  //////////////////////////////////////////////////////////////////////////
#if STATESYSTEM_USE_DEPRECATED
  // Deprecated alpha test: comparison function and reference value.
  struct AlphaStateDepr {
    GLenum    mode;
    GLfloat   refvalue;

    // defaults: always pass, reference value 1.0
    AlphaStateDepr()
      : mode(GL_ALWAYS)
      , refvalue(1.0)
    {
    }

    void applyGL() const;
    void getGL();
  };
#endif
  //////////////////////////////////////////////////////////////////////////

  // Stencil operations of one face.
  // The member names presumably map to the sfail / dpfail / dppass arguments
  // of glStencilOpSeparate - applyGL is not visible here, verify there.
  struct StencilOp
  {
    GLenum  fail;
    GLenum  zfail;
    GLenum  zpass;
  };
  // Stencil test function of one face: comparison, reference value and mask.
  struct StencilFunc
  {
    GLenum  func;
    GLuint  refvalue;
    GLuint  mask;
  };
  struct StencilState{
    StencilFunc funcs[MAX_FACES];
    StencilOp   ops[MAX_FACES];

    StencilState()
    {
      for (GLuint i = 0; i < MAX_FACES; i++){
        funcs[i].func = GL_ALWAYS;
        funcs[i].refvalue = 0;
        funcs[i].mask = ~0;
      }
    }

    void applyGL() const;
    void getGL();
  };

  //////////////////////////////////////////////////////////////////////////
  // Blend setup of one channel group: source weight, destination weight
  // and blend equation.
  struct BlendMode{
    GLenum srcw;
    GLenum dstw;
    GLenum equ;
  };
  // Blend setup of one draw buffer, with separate modes for rgb and alpha.
  struct BlendStage{
    BlendMode rgb;
    BlendMode alpha;
  };
  struct BlendState{
    GLbitfield  separateEnable; // only set this if you want per draw enable
    //GLfloat     color[4];
    GLuint      useSeparate;    // if set uses per draw, otherwise first
    BlendStage  blends[MAX_DRAWBUFFERS];

    BlendState() {
      separateEnable = 0;
      useSeparate = GL_FALSE;
      for (GLuint i = 0; i < MAX_DRAWBUFFERS; i++){
        blends[i].alpha.srcw = GL_ONE;
        blends[i].alpha.dstw = GL_ZERO;
        blends[i].alpha.equ  = GL_FUNC_ADD;
        blends[i].rgb = blends[i].alpha;
      }
    }

    void applyGL() const;
    void getGL();
  };
  //////////////////////////////////////////////////////////////////////////
  
  struct DepthState {
    GLenum  func;
    // depth bounds for NV?

    DepthState() {
      func = GL_LESS;
    }

    void applyGL() const;
    void getGL();
  };
  //////////////////////////////////////////////////////////////////////////
  
  struct LogicState {
    GLenum  op;

    LogicState() {
      op = GL_COPY;
    }

    void applyGL() const;
    void getGL();
  };
  //////////////////////////////////////////////////////////////////////////
  
  struct RasterState {
    //GLenum    frontFace;
    GLenum    cullFace;
    //GLfloat   polyOffsetFactor;
    //GLfloat   polyOffsetUnits;
    GLenum    polyMode;   // front and back, no separate support
    //GLfloat   lineWidth;
    GLfloat   pointSize;
    GLfloat   pointFade;
    GLenum    pointSpriteOrigin;

    RasterState() {
      //frontFace = GL_CCW;
      cullFace = GL_BACK;
      //polyOffsetFactor = 0;
      //polyOffsetUnits  = 0;
      polyMode = GL_FILL;
      //lineWidth = 1.0f;
      pointSize = 1.0f;
      pointFade = 1.0f;
      pointSpriteOrigin = GL_UPPER_LEFT;
    }

    void applyGL() const;
    void getGL();
  };

#if STATESYSTEM_USE_DEPRECATED
  // Deprecated rasterizer settings: line stipple and shade model.
  // Defaults: stipple factor 1, pattern with all bits set, GL_SMOOTH.
  struct RasterStateDepr {
    GLint     lineStippleFactor;
    GLushort  lineStipplePattern;
    GLenum    shadeModel;
    // ignore polygonStipple

    RasterStateDepr()
      : lineStippleFactor(1)
      , lineStipplePattern(~0)
      , shadeModel(GL_SMOOTH)
    {
    }

    void applyGL() const;
    void getGL();
  };
#endif

  //////////////////////////////////////////////////////////////////////////

  struct PrimitiveState {
    GLuint    restartIndex;
    GLint     patchVertices;
    GLenum    provokingVertex;

    PrimitiveState() {
      restartIndex = ~0;
      patchVertices = 3;
      provokingVertex = GL_LAST_VERTEX_CONVENTION;
    }

    void applyGL() const;
    void getGL();
  };

  //////////////////////////////////////////////////////////////////////////

  struct SampleState {
    GLfloat   coverage;
    GLboolean invert;
    GLuint    mask;

    SampleState() {
      coverage = 1.0;
      invert = GL_FALSE;
      mask = ~0;
    }

    void applyGL() const;
    void getGL();
  };
  //////////////////////////////////////////////////////////////////////////

  // Viewport rectangle in floating point. Only referenced by the
  // commented-out ViewportState in this part of the header.
  struct Viewport {
    float   x;
    float   y;
    float   width;
    float   height;
  };
  // Near / far depth range of one viewport (see DepthRangeState::depths).
  struct DepthRange {
    double  nearPlane;
    double  farPlane;
  };
  // Scissor rectangle in integer coordinates. Only referenced by the
  // commented-out ScissorState in this part of the header.
  struct Scissor {
    GLint   x;
    GLint   y;
    GLsizei width;
    GLsizei height;
  };

  /*
  struct ViewportState {
    GLuint        useSeparate;  // if set uses per view, otherwise first
    Viewport      viewports[MAX_VIEWPORTS];

    ViewportState() {
      useSeparate = GL_FALSE;
      for (GLuint i = 0; i < MAX_VIEWPORTS; i++){
        viewports[i].x = 0;
        viewports[i].y = 0;
        viewports[i].width = 0;
        viewports[i].height = 0;
      }
    }

    void applyGL() const;
    void getGL();
  };
  */

  struct DepthRangeState {
    GLuint        useSeparate;  // if set uses per view, otherwise first
    DepthRange    depths[MAX_VIEWPORTS];

    DepthRangeState() {
      useSeparate = GL_FALSE;
      for (GLuint i = 0; i < MAX_VIEWPORTS; i++){
        depths[i].nearPlane = 0;
        depths[i].farPlane  = 1;
      }
    }

    void applyGL() const;
    void getGL();
  };

  /*
  struct ScissorState {
    GLuint        useSeparate;    // if set uses per draw, otherwise first
    Scissor       scissor[MAX_VIEWPORTS];

    ScissorState() {
      useSeparate = GL_FALSE;
      for (GLuint i = 0; i < MAX_VIEWPORTS; i++){
        scissor[i].x = 0;
        scissor[i].y = 0;
        scissor[i].width = 0;
        scissor[i].height = 0;
      }
    }

    void applyGL() const;
    void getGL();
  };
  */

  struct ScissorEnableState {
    GLbitfield    separateEnable; // only set this if you want per view enable

    ScissorEnableState() {
      separateEnable = 0;
    }

    void applyGL() const;
    void getGL();
  };

  //////////////////////////////////////////////////////////////////////////

  struct MaskState {
    GLuint    colormaskUseSeparate;
    GLboolean colormask[MAX_DRAWBUFFERS][MAX_COLORS];
    GLboolean depth;
    GLuint    stencil[MAX_FACES];

    MaskState() {
      colormaskUseSeparate = GL_FALSE;
      depth = GL_TRUE;
      stencil[FACE_FRONT] = ~0;
      stencil[FACE_BACK] = ~0;
      for (GLuint i = 0; i < MAX_DRAWBUFFERS; i++){
        for (GLuint c = 0; c < MAX_COLORS; c++){
          colormask[i][c] = GL_TRUE;
        }
      }
    }

    void applyGL() const;
    void getGL();
  };

  //////////////////////////////////////////////////////////////////////////
  
  struct FBOState {
    GLuint  fboDraw;
    GLuint  fboRead;
    GLenum  readBuffer;
    GLenum  drawBuffers[MAX_DRAWBUFFERS];
    GLuint  numBuffers;

    FBOState() {
      fboDraw = 0;
      fboRead = 0;
      readBuffer = GL_BACK;
      for (GLuint i = 0; i < MAX_DRAWBUFFERS; i++){
        drawBuffers[i] = GL_NONE;
      }
      drawBuffers[0] = GL_BACK;
      numBuffers = 1;
    }

    void setFbo(GLuint fbo){
      fboDraw = fbo;
      fboRead = fbo;
      readBuffer = GL_COLOR_ATTACHMENT0;
      drawBuffers[0] = GL_COLOR_ATTACHMENT0;
      numBuffers = 1;
    }

    void applyGL(bool noBind=false) const;
    void getGL();
  };

  //////////////////////////////////////////////////////////////////////////

  struct VertexEnableState {
    GLbitfield    enabled;

    VertexEnableState() {
      enabled = 0;
    }

    void applyGL(GLbitfield changed=~0) const;
    void getGL();
  };

  // Data interpretation of a vertex attribute; selects the active member
  // of the VertexData union and is stored in VertexFormat::mode.
  enum VertexModeType {
    VERTEXMODE_FLOAT,
    VERTEXMODE_INT,
    VERTEXMODE_UINT,
    // ignore double and int64 for now
  };

  // Format description of one vertex attribute.
  // NOTE(review): the members look like the arguments of
  // glVertexAttrib*Format / glVertexAttribBinding - confirm in applyGL.
  struct VertexFormat {
    VertexModeType  mode;

    GLboolean normalized;
    
    GLuint    size;
    GLenum    type;
    GLsizei   relativeoffset;

    GLuint    binding;
  };

  // Per vertex buffer binding settings: instancing divisor and stride.
  struct VertexBinding {
    GLsizei       divisor;
    GLsizei       stride;
  };

  struct VertexFormatState {
    VertexFormat  formats[MAX_VERTEXATTRIBS];
    VertexBinding bindings[MAX_VERTEXBINDINGS];

    VertexFormatState() {
      for (GLuint i = 0; i < MAX_VERTEXATTRIBS; i++){
        formats[i].mode           = VERTEXMODE_FLOAT;
        formats[i].size           = 4;
        formats[i].type           = GL_FLOAT;
        formats[i].normalized     = GL_FALSE;
        formats[i].relativeoffset = 0;
        formats[i].binding        = i;
      }

      for (GLuint i = 0; i < MAX_VERTEXATTRIBS; i++){
        bindings[i].divisor = 0;
        bindings[i].stride  = 0;
      }
    }

    void applyGL(GLbitfield changedFormat = ~0,GLbitfield changedBinding = ~0) const;
    void getGL();
  };

  // Constant ("immediate") value of one vertex attribute.
  // `mode` tells which member of the anonymous union holds the value.
  struct VertexData {
    VertexModeType  mode;
    union {
      float         floats[4];
      int           ints[4];
      unsigned int  uints[4];
    };
  };

  struct VertexImmediateState {
    VertexData  data[MAX_VERTEXATTRIBS];

    VertexImmediateState() {
      for (GLuint i = 0; i < MAX_VERTEXATTRIBS; i++){
        data[i].mode = VERTEXMODE_FLOAT;
        data[i].floats[0] = 0;
        data[i].floats[1] = 0;
        data[i].floats[2] = 0;
        data[i].floats[3] = 1;
      }
    }

    void applyGL(GLbitfield changed = ~0) const;
    void getGL(); // ensure proper mode, otherwise will get garbage
  };

  //////////////////////////////////////////////////////////////////////////

  struct ProgramState {
    // for sake of simplicity this mechanism only support programs
    // and not program pipelines, nor use of subroutines
    GLuint    program;

    ProgramState() {
      program = 0;
    }

    void applyGL() const;
    void getGL();
  };

  //////////////////////////////////////////////////////////////////////////

  struct EnableState {
    GLbitfield      stateBits;

    EnableState() {
      stateBits = 0;
    }

    void applyGL(GLbitfield changed = ~0) const;
    void getGL();
  };

#if STATESYSTEM_USE_DEPRECATED
  // Bitfield of the tracked deprecated capabilities; starts with no bit set.
  struct EnableStateDepr {
    GLbitfield      stateBitsDepr;

    EnableStateDepr()
      : stateBitsDepr(0)
    {
    }

    void applyGL(GLbitfield changed = ~0) const;
    void getGL();
  };
#endif

  //////////////////////////////////////////////////////////////////////////
  
  // Aggregate of all tracked sub-states. Every member is default constructed,
  // the deprecated parts only exist when STATESYSTEM_USE_DEPRECATED is set.
  struct State {
    EnableState           enable;
  #if STATESYSTEM_USE_DEPRECATED
    EnableStateDepr       enableDepr;
  #endif
    ProgramState          program;
    ClipDistanceState     clip;
  #if STATESYSTEM_USE_DEPRECATED
    AlphaStateDepr        alpha;
  #endif
    BlendState            blend;
    DepthState            depth;
    StencilState          stencil;
    LogicState            logic;
    PrimitiveState        primitive;
    SampleState           sample;
    RasterState           raster;
  #if STATESYSTEM_USE_DEPRECATED
    RasterStateDepr       rasterDepr;
  #endif
    //ViewportState         viewport;
    DepthRangeState       depthrange;
    //ScissorState          scissor;
    ScissorEnableState    scissorenable;
    MaskState             mask;
    FBOState              fbo;
    VertexEnableState     vertexenable;
    VertexFormatState     vertexformat;
    VertexImmediateState  verteximm;

    // This value only exists to ease compatibility with NV_command_list.
    // It is unaffected by apply or get operations; its value
    // is set during StateSystem::set.
    GLenum                basePrimitiveMode; 

    // Starts with GL_TRIANGLES as base primitive mode.
    State() 
      : basePrimitiveMode(GL_TRIANGLES)
    {

    }

    // NOTE(review): `coreonly` presumably skips the deprecated sub-states and
    // `skipFboBinding` the framebuffer binding - confirm in the implementation.
    void    applyGL(bool coreonly=false, bool skipFboBinding=false) const;
    void    getGL(bool coreonly=false);
  };
  
  // Handle of a state managed by the StateSystem.
  typedef unsigned int StateID;
  // Reserved handle value (all bits set); can be passed as `prev` to applyGL.
  static const StateID  INVALID_ID = ~0;

  // Lifetime of the system itself. The implementations are not part of this
  // header; `coreonly` is presumably stored in m_coreonly - verify.
  void    init(bool coreonly=false);
  void    deinit();
  
  // Management of `num` state handles stored in / read from `objects`.
  void    generate(GLuint num, StateID* objects);
  void    destroy( GLuint num, const StateID* objects );
  // Assigns / returns the State of a handle. `basePrimitiveMode` is what ends
  // up in State::basePrimitiveMode (see the note on that member).
  void          set(StateID id, const State& state, GLenum basePrimitiveMode);
  const State&  get(StateID id) const;
  
  void    applyGL(StateID id, bool skipFboBinding) const;         // brute force, sets everything
  void    applyGL(StateID id, StateID prev,bool skipFboBinding);  // tries to avoid redundant changes, INVALID_ID can be passed as previous

  void    prepareTransition(StateID id, StateID prev); // can speed up a later state apply
  
  
private:
  // Capacity of the StateInternal::others / StateInternal::diffs arrays.
  static const int MAX_DIFFS = 16;

  // Identifies the other state of a stored diff.
  // `changeID` presumably records StateInternal::changeID of that state at
  // the time the diff was made, to detect stale diffs - verify in the
  // implementation.
  struct StateDiffKey{
    StateID   state;
    GLuint    changeID;
  };

  // Difference between two states, stored as bitfields.
  struct StateDiff {

    // One entry per State member (the commented-out members are skipped).
    // Presumably bit positions inside changedContentBits - verify in makeDiff.
    // The *_DEPR entries exist regardless of STATESYSTEM_USE_DEPRECATED.
    enum ContentBits {
      ENABLE,
      ENABLE_DEPR,
      PROGRAM,
      CLIP,
      ALPHA_DEPR,
      BLEND,
      DEPTH,
      STENCIL,
      LOGIC,
      PRIMITIVE,
      RASTER,
      RASTER_DEPR,
      //VIEWPORT,
      DEPTHRANGE,
      //SCISSOR,
      SCISSORENABLE,
      MASK,
      FBO,
      VERTEXENABLE,
      VERTEXFORMAT,
      VERTEXIMMEDIATE,
    };

    // The finer grained fields presumably feed the `changed` arguments of the
    // matching applyGL functions (EnableState, VertexEnableState, ...).
    GLbitfield    changedContentBits;
    GLbitfield    changedStateBits;
    GLbitfield    changedStateDeprBits;
    GLbitfield    changedVertexEnable;
    GLbitfield    changedVertexImm;
    GLbitfield    changedVertexFormat;
    GLbitfield    changedVertexBinding;
    GLuint        pad;
  };

  // Storage of one managed state plus the diffs towards other states.
  struct StateInternal {
    State       state;
    GLuint      changeID;

    // others[i] names the state that diffs[i] belongs to; usedDiff is
    // presumably the number of valid entries - verify in
    // prepareTransitionCache.
    int           usedDiff;
    StateDiffKey  others[MAX_DIFFS];
    StateDiff     diffs[MAX_DIFFS];

    StateInternal() {
      changeID = 0;
      // was left uninitialized before; a fresh state has no stored diffs
      usedDiff = 0;
    }
  };

  bool                          m_coreonly;   // presumably the value passed to init() - verify
  std::vector<StateInternal>    m_states;     // storage of the managed states
  std::vector<StateID>          m_freeIDs;    // presumably handles that are available for reuse - verify

  // Helpers behind applyGL(id,prev,...) and prepareTransition; the
  // definitions are not part of this header.
  void  makeDiff(StateDiff& diff, const StateInternal &fromInternal, const StateInternal &toInternal);
  void  applyDiffGL(const StateDiff& diff, const State &to, bool skipFboBinding);
  int   prepareTransitionCache(StateID prev, StateInternal& to );
};


#endif

================================================
FILE: tokenbase.cpp
================================================
/*
 * Copyright (c) 2014-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */

/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include "tokenbase.hpp"

using namespace nvtoken;

#include "common.h"

namespace csfviewer
{

  // Reports whether the has_GL_NV_command_list flag is set.
  bool TokenRendererBase::hasNativeCommandList()
  {
    // collapse the flag into a strict bool
    return has_GL_NV_command_list ? true : false;
  }

  // Creates the state objects (and optionally the command lists) either
  // through NV_command_list or through the emulating StateSystem, and
  // initializes the nvtoken internals for the chosen path.
  // Bindless is only used when both `bindlessUbo` and `bindlessVbo` are set.
  void TokenRendererBase::init(bool bindlessUbo, bool bindlessVbo)
  {
    m_bindlessVboUbo = bindlessVbo && bindlessUbo;
    m_hwsupport      = hasNativeCommandList() && !m_emulate;

    for (int shade = 0; shade < NUM_SHADES; ++shade){
      m_tokenAddresses[shade] = 0;
    }

    if (!m_hwsupport){
      // we use a fast mode for glBufferAddressRangeNV where we ignore precise buffer boundaries
      // this will trigger the driver to throw warnings, which may cause a crash
#if !defined (NDEBUG)
      if (m_bindlessVboUbo){
        glDisable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
        glDisable(GL_DEBUG_OUTPUT);
      }
#endif

      m_stateSystem.init(false);
      m_stateSystem.generate(NUM_STATES, m_stateIDs);

      // the emulation uses the StateSystem ids as "state objects"
      for (int state = 0; state < NUM_STATES; ++state){
        m_stateObjects[state] = m_stateIDs[state];
      }
    }
    else{
      glCreateStatesNV(NUM_STATES, m_stateObjects);

      if (m_uselist){
        glCreateCommandListsNV(NUM_SHADES, m_commandLists);
      }
    }

    nvtokenInitInternals(m_hwsupport, m_bindlessVboUbo);
  }

  // Logs the size of the token stream of `shadeType`, the number of state
  // sequences and how often each token type occurs in the stream.
  // A shade type without any recorded sequence is reported with size 0.
  void TokenRendererBase::printStats( ShadeType shadeType )
  {
    int stats[NVTOKEN_TYPES] = {0};

    ShadeCommand& sc = m_shades[shadeType];

    size_t num  = sc.states.size();
    size_t size = 0;

    // guard the empty case: num-1 would wrap around and index
    // offsets/sizes out of bounds
    if (num){
      // covered range: begin of the first up to the end of the last sequence
      size = sc.offsets[num-1] + sc.sizes[num-1] - sc.offsets[0];

      nvtokenGetStats(&m_tokenStreams[shadeType][sc.offsets[0]], size, stats);
    }

    LOGI("type: %s\n",toString(shadeType));
    LOGI("commandsize: %zu\n",size);
    LOGI("state toggles: %zu\n", num);
    LOGI("tokens:\n");
    for (int i = 0; i < NVTOKEN_TYPES; i++){
      const char* what = nvtokenCommandToString(i);
      // only list token types that have a name and were actually used
      if (what && stats[i]){
        LOGI("%s:\t %6d\n", what,stats[i]);
      }
    }
    LOGI("\n");
  }

  void TokenRendererBase::finalize(const Resources &resources, bool fillBuffers)
  {
    {
      m_tokenStreams[SHADE_SOLIDWIRE_SPLIT] = m_tokenStreams[SHADE_SOLIDWIRE];
      m_shades[SHADE_SOLIDWIRE_SPLIT] = m_shades[SHADE_SOLIDWIRE];
      if (USE_STATEFBO_SPLIT){
        ShadeCommand& sc = m_shades[SHADE_SOLIDWIRE_SPLIT];
        for (size_t i = 0; i < sc.sizes.size(); i++){
          if (sc.states[i] == m_stateObjects[STATE_LINES]){
            sc.states[i] = m_stateObjects[STATE_LINES_SPLIT];
          }
        }
      }
      else{
        ShadeCommand& sc = m_shades[SHADE_SOLIDWIRE_SPLIT];
        for (size_t i = 0; i < sc.sizes.size(); i++)
        {
          if (sc.states[i] == m_stateObjects[STATE_LINES]){
            sc.fbos[i] = resources.fbo2;
          }
          else{
            sc.fbos[i] = resources.fbo;
          }
        }
      }
    }

    glCreateBuffers(NUM_SHADES,m_tokenBuffers);
    if (m_hwsupport && fillBuffers){
      for (int i = 0; i < NUM_SHADES; i++){
        glNamedBufferStorage(m_tokenBuffers[i],m_tokenStreams[i].size(), &m_tokenStreams[i][0], 0);
        if (m_useaddress){
          glGetNamedBufferParameterui64vNV(m_tokenBuffers[i], GL_BUFFER_GPU_ADDRESS_NV, &m_tokenAddresses[i]);
          glMakeNamedBufferResidentNV(m_tokenBuffers[i], GL_READ_ONLY);

          ShadeCommand& sc = m_shades[i];
          sc.addresses.clear();
          sc.addresses.reserve( sc.offsets.size() );
          for (size_t n = 0; n < sc.offsets.size(); n++){
            sc.addresses.push_back( m_tokenAddresses[i] + sc.offsets[n] );
          }
        }
      }
    }
  }

  // Releases what init() and finalize() created: buffer residency, token
  // buffers, state objects / command lists, and the state system.
  void TokenRendererBase::deinit()
  {
    // only buffers that received a GPU address were made resident
    for (int shade = 0; m_useaddress && shade < NUM_SHADES; ++shade){
      if (m_tokenAddresses[shade] != 0){
        glMakeNamedBufferNonResidentNV( m_tokenBuffers[shade] );
      }
    }

    glDeleteBuffers(NUM_SHADES,m_tokenBuffers);

    if (!m_hwsupport){
      // counterpart of the debug output disabling done in init()
#if !defined (NDEBUG)
      if (m_bindlessVboUbo){
        glEnable(GL_DEBUG_OUTPUT);
        glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
      }
#endif
    }
    else{
      glDeleteStatesNV(NUM_STATES,m_stateObjects);

      if (m_uselist){
        glDeleteCommandListsNV(NUM_SHADES,m_commandLists);
      }
    }

    m_stateSystem.deinit();
  }


  // Brings the four state objects (STATE_TRIS, STATE_TRISOFFSET, STATE_LINES,
  // STATE_LINES_SPLIT) and the compiled command lists up to date.
  //
  // The state objects are re-captured when resources.stateChangeID differs
  // from the cached id. The command lists are recompiled when either that id
  // or resources.fboTextureChangeID changed, and only if both m_hwsupport and
  // m_uselist are set.
  //
  // With m_hwsupport the capture is done by glStateCaptureNV, otherwise the
  // current GL state is read back into a StateSystem::State and stored in
  // m_stateSystem.
  //
  // After a re-capture the context is left with program 0, polygon offset fill
  // disabled, polygon offset (0,0) and resources.fbo bound as framebuffer.
  void TokenRendererBase::captureState( const Resources &resources )
  {
    bool stateChanged  = m_stateChangeID != resources.stateChangeID;
    bool fboTexChanged = m_fboStateChangeID != resources.fboTextureChangeID;

    // remember what has been processed
    m_stateChangeID = resources.stateChangeID;
    m_fboStateChangeID = resources.fboTextureChangeID;

    if (stateChanged){
      // only used by the emulation path (!m_hwsupport)
      StateSystem::State state;
      state.verteximm.data[VERTEX_WIREMODE].mode = StateSystem::VERTEXMODE_INT; // need to set this properly


      if (m_bindlessVboUbo){
        // temp workaround
#if USE_RESETADDRESSES
        glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV,0,0,0);
        glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV,0,0,0);
        glBufferAddressRangeNV(GL_UNIFORM_BUFFER_ADDRESS_NV,UBO_MATERIAL,0,0);
        glBufferAddressRangeNV(GL_UNIFORM_BUFFER_ADDRESS_NV,UBO_MATRIX,0,0);
        glBufferAddressRangeNV(GL_UNIFORM_BUFFER_ADDRESS_NV,UBO_SCENE,0,0);
#endif
      }

      // we will do a series of state captures
      // every capture below builds on the GL state left by the previous one,
      // so the order of the steps matters
      glBindFramebuffer(GL_FRAMEBUFFER, resources.fbo);
      glUseProgram(resources.programUsed);

      // 1) triangles, wire mode off
      SetWireMode(GL_FALSE);

      if (m_hwsupport){
        glStateCaptureNV(m_stateObjects[STATE_TRIS],GL_TRIANGLES);
      }
      else {
        state.getGL(); // very costly, smarter would be setting this manually
        m_stateSystem.set(m_stateIDs[STATE_TRIS], state, GL_TRIANGLES);
      }

      // 2) same, with polygon offset fill enabled
      glEnable(GL_POLYGON_OFFSET_FILL);
      // glPolygonOffset(1,1); //not captured


      if (m_hwsupport){
        glStateCaptureNV(m_stateObjects[STATE_TRISOFFSET],GL_TRIANGLES);
      }
      else {
        state.getGL(); // very costly, smarter would be setting this manually
        m_stateSystem.set(m_stateIDs[STATE_TRISOFFSET], state, GL_TRIANGLES);
      }

      // 3) lines, wire mode on (polygon offset fill is still enabled)
      SetWireMode(GL_TRUE);

      if (m_hwsupport){
        glStateCaptureNV(m_stateObjects[STATE_LINES],GL_LINES);
      }
      else {
        state.getGL(); // very costly, smarter would be setting this manually
        m_stateSystem.set(m_stateIDs[STATE_LINES], state, GL_LINES);
      }

      // 4) same as 3), but with resources.fbo2 as framebuffer
      glBindFramebuffer(GL_FRAMEBUFFER, resources.fbo2);

      if (m_hwsupport){
        glStateCaptureNV(m_stateObjects[STATE_LINES_SPLIT], GL_LINES);
      }
      else {
        state.getGL(); // very costly, smarter would be setting this manually
        m_stateSystem.set(m_stateIDs[STATE_LINES_SPLIT], state, GL_LINES);
      }

      if (!m_hwsupport){
        // prepare the transitions between the offset triangles and both line
        // states, in both directions
        // m_stateObjects holds the same values as m_stateIDs on this path (see init)
        m_stateSystem.prepareTransition(m_stateIDs[STATE_TRISOFFSET], m_stateObjects[STATE_LINES]);
        m_stateSystem.prepareTransition(m_stateIDs[STATE_LINES],      m_stateObjects[STATE_TRISOFFSET]);
        m_stateSystem.prepareTransition(m_stateIDs[STATE_TRISOFFSET], m_stateObjects[STATE_LINES_SPLIT]);
        m_stateSystem.prepareTransition(m_stateIDs[STATE_LINES_SPLIT],m_stateObjects[STATE_TRISOFFSET]);
      }

      // reset, stored in stateobjects
      glUseProgram(0);
      glDisable(GL_POLYGON_OFFSET_FILL);
      glPolygonOffset(0,0); 
#if 1
      // workaround
      glBindFramebuffer(GL_FRAMEBUFFER, resources.fbo);
#else
      glBindFramebuffer(GL_FRAMEBUFFER, 0);
#endif
    }

    if (m_hwsupport && m_uselist && (stateChanged || fboTexChanged)){
      for (int i = 0; i < NUM_SHADES; i++){
        ShadeCommand& shade = m_shades[i];

        // client memory pointer of every sequence inside the token stream
        std::vector<const void*>  ptrs;
        ptrs.reserve(shade.offsets.size());
        for (size_t p = 0; p < shade.offsets.size(); p++){
          ptrs.push_back(&m_tokenStreams[i][shade.offsets[p]]);
        }

        // NOTE(review): &ptrs[0] and the other &...[0] expressions require
        // non-empty vectors - confirm that every shade type has sequences.
        glCommandListSegmentsNV(m_commandLists[i],1);
        glListDrawCommandsStatesClientNV(m_commandLists[i],0, &ptrs[0], &shade.sizes[0], &shade.states[0], &shade.fbos[0], int(shade.states.size()) );
        glCompileCommandListNV(m_commandLists[i]);
      }
    }
  }

  void TokenRendererBase::renderShadeCommandSW( const void* NV_RESTRICT stream, size_t streamSize, ShadeCommand &shade )
  {
    nvtokenDrawCommandsStatesSW(stream, streamSize, &shade.offsets[0], &shade.sizes[0], &shade.states[0], &shade.fbos[0], GLuint(shade.states.size()), m_stateSystem);
  }

}


================================================
FILE: tokenbase.hpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

// a few performance tests
// only affect TOKEN techniques
// non-zero: captureState() resets the bindless vertex/element/uniform address
// ranges to 0 before capturing (only when bindless is in use)
#define USE_RESETADDRESSES    1
// selects the token type behind NVTokenDrawElemsUsed:
// NVTokenDrawElems (1) or NVTokenDrawElemsInstanced (0)
#define USE_FASTDRAWS         1
// non-zero: finalize() switches the line sequences of the split variant to the
// STATE_LINES_SPLIT state object; zero: it fills the fbos[] array instead
#define USE_STATEFBO_SPLIT    0 //otherwise fbo[] as used
#define USE_POLYOFFSETTOKEN   1

// only affects TOKEN
#define USE_STATEOBJ_REBUILD  0 // does 100 statecaptures per frame
#define USE_NOFILTER          0

// only affects TOKENSORT
#define USE_PERFRAMEBUILD     0


#include <assert.h>
#include <algorithm>
#include "renderer.hpp"
#include "nvtoken.hpp"

using namespace nvtoken;

namespace csfviewer
{
// stage values for uniform buffer tokens, looked up in the nvtoken stage table
#define UBOSTAGE_VERTEX     (nvtoken::s_nvcmdlist_stages[NVTOKEN_STAGE_VERTEX])
#define UBOSTAGE_FRAGMENT   (nvtoken::s_nvcmdlist_stages[NVTOKEN_STAGE_FRAGMENT])


// token type used for indexed draws, chosen by USE_FASTDRAWS
#if USE_FASTDRAWS
  #define NVTokenDrawElemsUsed  NVTokenDrawElems
#else
  #define NVTokenDrawElemsUsed  NVTokenDrawElemsInstanced
#endif

  class TokenRendererBase {
  public:
    enum StateType {
      STATE_TRIS,
      STATE_TRISOFFSET,
      STATE_LINES,
      STATE_LINES_SPLIT,
      NUM_STATES,
    };

    struct ShadeCommand {
      std::vector<GLuint64>   addresses;
      std::vector<GLintptr>   offsets;
      std::vector<GLsizei>    sizes;
      std::vector<GLuint>     states;
      std::vector<GLuint>     fbos;
    };

    bool  m_emulate;
    bool  m_sort;
    bool  m_uselist;
    bool  m_useaddress;

    TokenRendererBase()
      : m_hwsupport(false)
      , m_bindlessVboUbo(false)
      , m_useaddress(false)
      , m_emulate(false)
      , m_uselist(false)
      , m_sort(false)
      , m_stateChangeID(~0)
      , m_fboStateChangeID(~0)
    {

    }

    static bool hasNativeCommandList();

  protected:

    bool                        m_hwsupport;
    bool                        m_bindlessVboUbo;

    GLuint                      m_tokenBuffers[NUM_SHADES];
    GLuint64                    m_tokenAddresses[NUM_SHADES];
    std::string                 m_tokenStreams[NUM_SHADES];
    GLuint                      m_commandLists[NUM_SHADES];
    ShadeCommand                m_shades[NUM_SHADES];

    size_t                      m_stateChangeID;
    size_t                      m_fboStateChangeID;

    StateSystem                 m_stateSystem;
    StateSystem::StateID        m_stateIDs[NUM_STATES];
    GLuint                      m_stateObjects[NUM_STATES];

    void init(bool bindlessUbo, bool bindlessVbo);
    void printStats(ShadeType shadeType);
    void finalize(const Resources &resources, bool fillBuffers=true);
    void deinit();

    void captureState(const Resources &resources);

    void renderShadeCommandSW( const void* NV_RESTRICT stream, size_t streamSize, ShadeCommand &shade );
  };
}


================================================
FILE: transform-leaves.comp.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 430
/**/

// USE_COMPUTE selects between the compute shader interface (1, default)
// and the variant that reads the node index from a vertex attribute (0)
#ifndef USE_COMPUTE
#define USE_COMPUTE 1
#endif

// capacity of the node path that main() collects towards the root
#define MAX_LEVELS 10

// a parentsBuffer entry stores the level in its low LEVELBITS bits
// and the parent index in the bits above
#define LEVELBITS 8

// every node owns MATRICES consecutive matrices: the world matrices start at
// MATRIX_BEGIN_WORLD, the object matrices at MATRIX_BEGIN_OBJECT, each as a
// pair of base (MATRIX_BASE) and inverse transpose (MATRIX_INVTRANS)
#define MATRIX_BASE     0
#define MATRIX_INVTRANS 1

#define MATRIX_BEGIN_WORLD  0
#define MATRIX_BEGIN_OBJECT 2
#define MATRICES            4


#if USE_COMPUTE

  layout (local_size_x = 256) in;

  // list of the node indices to process, one per invocation
  layout(std430,binding=2) buffer scratchBuffer {
    int nodes[];
  };

  // number of valid entries in nodes[]
  layout(location=0) uniform int count;
  layout(location=1) uniform int levelcap; // must be >= 1
  
  // invocations beyond the list do nothing
  #define BAILOUT gl_GlobalInvocationID.x >= count
  int self = nodes[gl_GlobalInvocationID.x];

#else
  layout(location=0) uniform int levelcap; // must be >= 1

  // the node index arrives as vertex attribute, there is nothing to skip
  #define BAILOUT false
  layout(location=0) in int self;

#endif

// per node: (parent index << LEVELBITS) | level
layout(binding=0) uniform isamplerBuffer parentsBuffer;

// output, MATRICES matrices per node
layout(std430,binding=0) restrict buffer worldMatricesBuffer {
  mat4 worldMatrices[];
};

// read access to the matrices, four texels per matrix (see getMatrix)
layout(binding=1) uniform samplerBuffer texWorldMatrices;
layout(binding=2) uniform samplerBuffer texObjectMatrices;

// Builds matrix number 'idx' of 'texbuffer' from its four consecutive
// texels (idx*4 .. idx*4+3), one texel per column.
mat4 getMatrix(samplerBuffer texbuffer, int idx)
{
  int  first = idx*4;

  vec4 col0 = texelFetch(texbuffer, first + 0);
  vec4 col1 = texelFetch(texbuffer, first + 1);
  vec4 col2 = texelFetch(texbuffer, first + 2);
  vec4 col3 = texelFetch(texbuffer, first + 3);

  return mat4(col0, col1, col2, col3);
}

// Fetches object matrix 'what' (MATRIX_BASE or MATRIX_INVTRANS) of node 'idx'.
// No ';' after the body: an empty declaration at global scope is only
// valid from GLSL 4.60 on, this shader is #version 430.
mat4 getObjectMatrix(int idx, int what){
  return getMatrix(texObjectMatrices,idx*MATRICES + what + MATRIX_BEGIN_OBJECT);
}

// Fetches world matrix 'what' (MATRIX_BASE or MATRIX_INVTRANS) of node 'idx'.
// No ';' after the body: an empty declaration at global scope is only
// valid from GLSL 4.60 on, this shader is #version 430.
mat4 getWorldMatrix(int idx, int what){
  return getMatrix(texWorldMatrices,idx*MATRICES + what + MATRIX_BEGIN_WORLD);
}

// Updates the world matrices along the path from the invocation's node up to
// the first ancestor whose level equals 'levelcap' (at most MAX_LEVELS nodes).
void main()
{
  if (BAILOUT){
    return;
  }
  
  // walk with a local copy: 'self' is a vertex shader input when
  // USE_COMPUTE is 0, and shader inputs must not be written to
  int  node = self;

  int  levels[MAX_LEVELS];
  int  curlevel = 0;
  
  // build path to root
  while (curlevel < MAX_LEVELS){
    levels[curlevel++] = node;
    int info = texelFetch(parentsBuffer,node).x;
        node = info >> LEVELBITS;
    int lvl  = info & ((1<<LEVELBITS)-1);
    if (lvl == levelcap){
      break;
    }
  }
  
  // init root: 'node' is now the parent of the topmost collected node
  mat4 parentBase = getWorldMatrix(node,MATRIX_BASE);
  
  while( curlevel-- > 0) {
    node = levels[curlevel];
    
    // walk downwards, save matrix in registers & save at end
    // never read worldmatrices due to read/write hazards
   
    parentBase = parentBase * getObjectMatrix(node,MATRIX_BASE);

    worldMatrices[node*MATRICES + MATRIX_BEGIN_WORLD + MATRIX_BASE]     = parentBase;
    worldMatrices[node*MATRICES + MATRIX_BEGIN_WORLD + MATRIX_INVTRANS] = transpose(inverse(parentBase));
  }
}


================================================
FILE: transform-level.comp.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 430
/**/

// USE_COMPUTE selects between the compute shader interface (1, default)
// and the variant that reads the node index from a vertex attribute (0)
#ifndef USE_COMPUTE
#define USE_COMPUTE 1
#endif

// main() shifts a parentsBuffer entry right by LEVELBITS to get the parent index
#define LEVELBITS 8

// every node owns MATRICES consecutive matrices: the world matrices start at
// MATRIX_BEGIN_WORLD, the object matrices at MATRIX_BEGIN_OBJECT, each as a
// pair of base (MATRIX_BASE) and inverse transpose (MATRIX_INVTRANS)
#define MATRIX_BASE         0
#define MATRIX_INVTRANS     1

#define MATRIX_BEGIN_WORLD  0
#define MATRIX_BEGIN_OBJECT 2
#define MATRICES            4

#if USE_COMPUTE

  layout (local_size_x = 256) in;

  // list of the node indices to process, one per invocation
  layout(std430,binding=2) buffer scratchBuffer {
    int nodes[];
  };

  // number of valid entries in nodes[]
  layout(location=0) uniform int count;

  // invocations beyond the list do nothing
  #define BAILOUT gl_GlobalInvocationID.x >= count
  int self = nodes[gl_GlobalInvocationID.x];

#else

  // the node index arrives as vertex attribute, there is nothing to skip
  #define BAILOUT false
  layout(location=0) in int self;

#endif

// per node: parent index in the bits above LEVELBITS
layout(binding=0) uniform isamplerBuffer parentsBuffer;

// output, MATRICES matrices per node
layout(std430,binding=0) restrict buffer worldMatricesBuffer {
  mat4 worldMatrices[];
};

// read access to the matrices, four texels per matrix (see getMatrix)
layout(binding=1) uniform samplerBuffer texWorldMatrices;
layout(binding=2) uniform samplerBuffer texObjectMatrices;

// Builds matrix number 'idx' of 'texbuffer' from its four consecutive
// texels (idx*4 .. idx*4+3), one texel per column.
mat4 getMatrix(samplerBuffer texbuffer, int idx)
{
  int  first = idx*4;

  vec4 col0 = texelFetch(texbuffer, first + 0);
  vec4 col1 = texelFetch(texbuffer, first + 1);
  vec4 col2 = texelFetch(texbuffer, first + 2);
  vec4 col3 = texelFetch(texbuffer, first + 3);

  return mat4(col0, col1, col2, col3);
}

// Fetches object matrix 'what' (MATRIX_BASE or MATRIX_INVTRANS) of node 'idx'.
// No ';' after the body: an empty declaration at global scope is only
// valid from GLSL 4.60 on, this shader is #version 430.
mat4 getObjectMatrix(int idx, int what){
  return getMatrix(texObjectMatrices,idx*MATRICES + what + MATRIX_BEGIN_OBJECT);
}

// Fetches world matrix 'what' (MATRIX_BASE or MATRIX_INVTRANS) of node 'idx'.
// No ';' after the body: an empty declaration at global scope is only
// valid from GLSL 4.60 on, this shader is #version 430.
mat4 getWorldMatrix(int idx, int what){
  return getMatrix(texWorldMatrices,idx*MATRICES + what + MATRIX_BEGIN_WORLD);
}


// Computes the world matrix of node 'self' from the world matrix of its
// parent and its own object matrix, and stores it together with its
// inverse transpose.
void main()
{
  if (BAILOUT){
    return;
  }

  // the parent index sits above the level bits
  int parentInfo = texelFetch(parentsBuffer,self).x;
  int parent     = parentInfo >> LEVELBITS;

  // world base matrix
  mat4 parentWorld = getWorldMatrix(parent,MATRIX_BASE);
  mat4 object      = getObjectMatrix(self,MATRIX_BASE);
  mat4 world       = parentWorld * object;

#if 0
  // disabled alternative: combine the stored inverse transpose matrices
  mat4 parentInv = transpose(getWorldMatrix(parent,MATRIX_INVTRANS));
  mat4 objectInv = transpose(getObjectMatrix(self, MATRIX_INVTRANS));

  mat4 worldInv  = objectInv * parentInv;
#else
  mat4 worldInv = inverse(world);
#endif

  // first world matrix slot of this node
  int slot = self*MATRICES + MATRIX_BEGIN_WORLD;

  worldMatrices[slot + MATRIX_BASE]     = world;
  worldMatrices[slot + MATRIX_INVTRANS] = transpose(worldInv);
}


================================================
FILE: transformsystem.cpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#include <assert.h>

#include "transformsystem.hpp"
#include <nvgl/base_gl.hpp>

// Recomputes world matrices for the used levels of `nodeTree` (starting at
// depth 1) with the compute programs stored in m_programs.
//
//   ids            - exposed to the shaders as an R32I buffer texture
//   matricesObject - exposed as an RGBA32F buffer texture and bound to
//                    shader-storage binding 1
//   matricesWorld  - exposed as an RGBA32F buffer texture and bound to
//                    shader-storage binding 0
//
// Node ids are streamed level by level into the scratch buffer
// (shader-storage binding 2). The first dispatch runs the "leaves" program
// on everything gathered up to the dispatch depth; afterwards the "level"
// program is dispatched once per remaining level. All programs, indexed
// buffer bindings and texture units touched here are reset before returning.
void TransformSystem::process(const NodeTree& nodeTree, Buffer& ids, Buffer& matricesObject, Buffer& matricesWorld )
{
  glUseProgram(m_programs.transform_leaves);

  // (re)allocate the scratch buffer so it can hold one id per active node
  glBindBuffer    (GL_SHADER_STORAGE_BUFFER,  m_scratchGL);
  glBufferData    (GL_SHADER_STORAGE_BUFFER,  sizeof(GLuint)*nodeTree.getNumActiveNodes(),NULL,GL_STREAM_DRAW);

#if 0
  // APIC hack
  glTextureBufferEXT(m_texsGL[TEXTURE_IDS],   GL_TEXTURE_BUFFER, GL_R32I,    ids.buffer);
  glTextureBufferEXT(m_texsGL[TEXTURE_OBJECT],GL_TEXTURE_BUFFER, GL_RGBA32F, matricesObject.buffer);
  glTextureBufferEXT(m_texsGL[TEXTURE_WORLD], GL_TEXTURE_BUFFER, GL_RGBA32F, matricesWorld.buffer);
#else
  glTextureBufferRange(m_texsGL[TEXTURE_IDS],     GL_R32I, ids.buffer, ids.offset, ids.size);
  glTextureBufferRange(m_texsGL[TEXTURE_OBJECT],  GL_RGBA32F, matricesObject.buffer, matricesObject.offset, matricesObject.size);
  glTextureBufferRange(m_texsGL[TEXTURE_WORLD],   GL_RGBA32F, matricesWorld.buffer, matricesWorld.offset, matricesWorld.size);
#endif

  // texture unit i receives the buffer texture with enum value i
  for (int i = 0; i < TEXTURES; i++){
    nvgl::bindMultiTexture(GL_TEXTURE0 + i, GL_TEXTURE_BUFFER, m_texsGL[i]);
  }

  matricesWorld.BindBufferRange(GL_SHADER_STORAGE_BUFFER,0);
  matricesObject.BindBufferRange(GL_SHADER_STORAGE_BUFFER,1);
  glBindBufferBase(GL_SHADER_STORAGE_BUFFER,2,m_scratchGL);

  const int maxshaderlevels = 10;
  int maxlevels = maxshaderlevels;
  int totalNodes = 0;
  bool useLeaves = true;

  int currentDepth = 1;
  const NodeTree::Level* level = nodeTree.getUsedLevel(currentDepth);

  // TODO:
  //
  // This code lacks a proper heuristic for switching between level- and leaves-based processing.
  // One should prefer level mode if there are enough nodes per level, otherwise descend and gather
  // many leaves from multiple levels.
  //
  while (level){
    // dispatch on the last level, or if we have reached maxlevels.
    // The parentheses around (currentDepth+1) are required: '%' binds tighter than '+',
    // so without them the maxlevels test could never become true.
    const bool isLastLevel  = !nodeTree.getUsedLevel(currentDepth+1);
    const bool willdispatch = currentDepth && (isLastLevel || (currentDepth+1) % maxlevels == 0);

    // the last level in leaf mode must use all level nodes, and not just the leaves of this level,
    // as subsequent levels operate in level mode
    const std::vector<NodeTree::nodeID>& nodes = useLeaves && !willdispatch ? level->leaves : level->nodes;

    if (!nodes.empty()){
      // append this level's ids behind the ones gathered since the last dispatch
      glBufferSubData(GL_SHADER_STORAGE_BUFFER,totalNodes*sizeof(GLuint),sizeof(GLuint)*nodes.size(),&nodes[0]);
      totalNodes += (int)nodes.size();
    }

    currentDepth++;
    level = nodeTree.getUsedLevel(currentDepth);
    if (willdispatch){
      int groupsize = useLeaves ? m_leavesGroup : m_levelsGroup;

      // uniform location 0 carries the number of gathered ids for both programs
      glUniform1i(0,totalNodes);
      if (useLeaves){
        glUniform1i(1,1);
      }

      // one invocation per gathered id, rounded up to whole work groups
      glDispatchCompute((totalNodes+groupsize-1)/groupsize,1,1);
      // make the written matrices visible to the next dispatch's buffer and texel reads
      glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT);

      if (useLeaves){
        // switch to per-level mode after first batch of leaves is over (tip of hierarchy)
        glUseProgram(m_programs.transform_level);
        useLeaves = false;
        maxlevels = 1; // assure we dispatch every level
      }

      totalNodes = 0;
    }
  }

  // restore a neutral GL state
  glUseProgram(0);
  glBindBufferBase(GL_SHADER_STORAGE_BUFFER,0,0);
  glBindBufferBase(GL_SHADER_STORAGE_BUFFER,1,0);
  glBindBufferBase(GL_SHADER_STORAGE_BUFFER,2,0);

  for (int i = 0; i < TEXTURES; i++){
    nvgl::bindMultiTexture(GL_TEXTURE0 + i, GL_TEXTURE_BUFFER, 0);
  }
  
}

// Remembers the program handles and creates the GL objects owned by this
// system: the scratch buffer and one buffer texture per Textures entry.
void TransformSystem::init( const Programs &programs )
{
  m_programs = programs;

  glCreateBuffers(1, &m_scratchGL);
  glCreateTextures(GL_TEXTURE_BUFFER, TEXTURES, &m_texsGL[0]);
}

// Releases the GL objects created by init().
void TransformSystem::deinit()
{
  glDeleteBuffers(1, &m_scratchGL);
  glDeleteTextures(TEXTURES, &m_texsGL[0]);
}

void TransformSystem::update( const Programs &programs )
{
  m_programs = programs;

  GLuint groupsizes[3];
  glGetProgramiv(programs.transform_leaves, GL_COMPUTE_WORK_GROUP_SIZE, (GLint*)groupsizes);
  m_leavesGroup = groupsizes[0];

  glGetProgramiv(programs.transform_level, GL_COMPUTE_WORK_GROUP_SIZE, (GLint*)groupsizes);
  m_levelsGroup = groupsizes[0];
}


================================================
FILE: transformsystem.hpp
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


/* Contact ckubisch@nvidia.com (Christoph Kubisch) for feedback */

#ifndef TRANSFORMSYSTEM_H__
#define TRANSFORMSYSTEM_H__


#include <nvgl/extensions_gl.hpp>
#include <cstddef>

#include "nodetree.hpp"

// Computes node world matrices on the GPU using two compute programs
// (a "leaves" and a "level" variant) that are driven by process().
class TransformSystem {
public:

  // GL program handles used by process().
  struct Programs {
    GLuint  transform_level;
    GLuint  transform_leaves;
  };

  // A byte range [offset, offset + size) of a GL buffer object.
  struct Buffer {
    GLuint      buffer;
    GLintptr    offset;
    GLsizeiptr  size;

    // Wraps `buffer` starting at offset 0. When `sizei` is 0 the size is
    // queried from GL (GL_BUFFER_SIZE), otherwise `sizei` is used as given.
    // The GL_COPY_READ_BUFFER binding is used temporarily and is always
    // reset to 0 before returning.
    Buffer(GLuint buffer, size_t sizei=0)
      : buffer(buffer)
      , offset(0)
      , size((GLsizeiptr)sizei)
    {
      glBindBuffer(GL_COPY_READ_BUFFER, buffer);
      if (!sizei){
        // query into a correctly typed local instead of writing through a
        // pointer cast onto `size`
        if (sizeof(GLsizeiptr) > 4){
          GLint64 bytes = 0;
          glGetBufferParameteri64v(GL_COPY_READ_BUFFER, GL_BUFFER_SIZE, &bytes);
          size = (GLsizeiptr)bytes;
        }
        else{
          GLint bytes = 0;
          glGetBufferParameteriv(GL_COPY_READ_BUFFER, GL_BUFFER_SIZE, &bytes);
          size = (GLsizeiptr)bytes;
        }
      }
      // unbind on every path so no binding is leaked to the caller
      glBindBuffer(GL_COPY_READ_BUFFER, 0);
    }

    // Empty range that refers to no buffer.
    Buffer()
      : buffer(0)
      , offset(0)
      , size(0)
    {

    }

    // Binds this range to the indexed binding point `index` of `target`.
    inline void BindBufferRange(GLenum target, GLuint index) const {
      glBindBufferRange(target, index, buffer, offset, size);
    }
    // Attaches this range to the buffer texture currently bound to `target`.
    inline void TexBuffer(GLenum target, GLenum internalformat) const {
      glTexBufferRange(target, internalformat, buffer, offset, size);
    }
   
  };
  
  // Stores the programs and creates the internal GL objects.
  void init( const Programs &programs );
  // Destroys the internal GL objects.
  void deinit();
  // Stores new programs and refreshes the cached work-group sizes.
  void update( const Programs &programs );
  
  // Runs the transform update for the given tree and buffers.
  void process(const NodeTree&, Buffer& ids, Buffer& matricesObject, Buffer& matricesWorld );
  
private:

  // Buffer textures owned by the system. process() binds entry i to texture
  // unit GL_TEXTURE0 + i, so the order must match the shaders' sampler bindings.
  enum Textures {
    TEXTURE_IDS,
    TEXTURE_WORLD,
    TEXTURE_OBJECT,
    TEXTURES,
  };

  // x-dimension of the compute work-group size of each program (set by update()).
  GLuint    m_leavesGroup;
  GLuint    m_levelsGroup;

  Programs  m_programs;
  // Buffer that process() fills with the node ids of the current dispatch.
  GLuint    m_scratchGL;
  GLuint    m_texsGL[TEXTURES];
};

#endif


================================================
FILE: xplode-animation.comp.glsl
================================================
/*
 * Copyright (c) 2014-2021, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * SPDX-FileCopyrightText: Copyright (c) 2014-2021 NVIDIA CORPORATION
 * SPDX-License-Identifier: Apache-2.0
 */


#version 430
/**/

// Animation pass: every invocation rewrites the object matrices of one node
// `self`, scaling parts of the original matrices by `scale` (see main()).

// USE_COMPUTE selects between the compute-shader variant (default) and a
// variant that derives `self` from gl_VertexID.
#ifndef USE_COMPUTE
#define USE_COMPUTE 1
#endif

// Slot of a matrix inside a world/object pair.
#define MATRIX_BASE         0
#define MATRIX_INVTRANS     1

// Every node owns MATRICES consecutive mat4 slots; the world pair starts at
// MATRIX_BEGIN_WORLD and the object pair at MATRIX_BEGIN_OBJECT.
#define MATRIX_BEGIN_WORLD  0
#define MATRIX_BEGIN_OBJECT 2
#define MATRICES            4

// Factor applied in main(); main() also divides by it (1/scale).
// NOTE(review): a value of 0 would divide by zero - confirm callers never pass it.
layout(location=0) uniform float scale;

#if USE_COMPUTE

  layout (local_size_x = 256) in;

  // Number of nodes to process; invocations at or beyond it return early.
  layout(location=1) uniform int count;

  #define BAILOUT gl_GlobalInvocationID.x >= count
  // One invocation per node: the node index is the global invocation index.
  int self = int(gl_GlobalInvocationID.x);

#else

  #define BAILOUT false
  int self = int(gl_VertexID);

#endif

// Output: matrices written by main() at self*MATRICES + slot.
layout(std430,binding=0) restrict buffer matricesBuffer {
  mat4 matrices[];
};

// Read-only buffer-texture view holding the original matrices.
layout(binding=0) uniform samplerBuffer texMatricesOrig;
// Assembles the idx-th mat4 of a buffer texture from four consecutive
// vec4 texels (one texel per matrix column).
mat4 getMatrix(samplerBuffer texbuffer, int idx)
{
  int firstTexel = idx*4;

  vec4 column0 = texelFetch(texbuffer, firstTexel + 0);
  vec4 column1 = texelFetch(texbuffer, firstTexel + 1);
  vec4 column2 = texelFetch(texbuffer, firstTexel + 2);
  vec4 column3 = texelFetch(texbuffer, firstTexel + 3);

  return mat4(column0, column1, column2, column3);
}

// Fetches an original object-space matrix of node `idx` from texMatricesOrig.
// `what` selects the slot inside the object pair (MATRIX_BASE or
// MATRIX_INVTRANS).
mat4 getObjectMatrixOrig(int idx, int what){
  return getMatrix(texMatricesOrig,idx*MATRICES + what + MATRIX_BEGIN_OBJECT);
}

// Fetches an original world-space matrix of node `idx` from texMatricesOrig.
// `what` selects the slot inside the world pair (MATRIX_BASE or
// MATRIX_INVTRANS).
mat4 getWorldMatrixOrig(int idx, int what){
  return getMatrix(texMatricesOrig,idx*MATRICES + what + MATRIX_BEGIN_WORLD);
}

// Writes the animated object matrices of node `self`: the xyz of the base
// matrix's last column is multiplied by `scale`, and the w component of the
// first three columns of the inverse-transpose is multiplied by 1/scale.
void main()
{
  if (BAILOUT){
    return;
  }

  mat4 srcBase = getObjectMatrixOrig(self,MATRIX_BASE);
  mat4 srcIT   = getObjectMatrixOrig(self,MATRIX_INVTRANS);

#if 0
  // compiler bug
  mat4 dstBase = srcBase;
  mat4 dstIT   = srcIT;
  dstBase[3].xyz *= scale;
  dstIT[0].w /= scale;
  dstIT[1].w /= scale;
  dstIT[2].w /= scale;
#else
  // build the results through the mat4 constructor with component-wise
  // column factors instead of modifying matrix components in place
  vec4 lastColumnFactor = vec4(vec3(scale), 1);
  vec4 wFactor          = vec4(vec3(1), 1/scale);

  mat4 dstBase = mat4(srcBase[0],
                      srcBase[1],
                      srcBase[2],
                      srcBase[3]*lastColumnFactor);
  mat4 dstIT   = mat4(srcIT[0]*wFactor,
                      srcIT[1]*wFactor,
                      srcIT[2]*wFactor,
                      srcIT[3]);
#endif

  int firstSlot = self*MATRICES + MATRIX_BEGIN_OBJECT;
  matrices[firstSlot + MATRIX_BASE]     = dstBase;
  matrices[firstSlot + MATRIX_INVTRANS] = dstIT;
}