Click below to see a short video clip of the result.
Overview:
This tutorial is based on the geometry shader / compute buffer tutorial, which can be followed here. The purpose of this example is to demonstrate, in a straightforward manner, how to use compute shaders together with an indirect draw call. There are many ways you could extend this example to make it more performant; for example, you could factor out the geometry shader entirely by having the compute shader export a quad (a pair of triangles) per pixel at initialization time.
Why Should I Care?
Compute shaders are great for many reasons. For one, they allow you to run tons of generic calculations on the GPU without having to jump through the hoops of writing graphics code. A compute shader can accept, as input, any kind of buffer with any kind of data in it. You could even pass sound data to the GPU to process and then read it back to the CPU to save as an audio file. You can also compute data at initialization time and then reference it in a classic rasterization shader while your CPU is doing other work.

Keep in mind, any time you send or receive data between the CPU and GPU, there is a performance cost. Imagine you generate a procedural mesh in your compute shader and then want to draw it. In order to submit a draw call on the CPU, you need to know how many vertices you have. However, the compute shader generated the mesh data procedurally on the GPU, so you have no idea how many triangles it produced. What do you do? Well, you could read the buffer data back from the GPU to the CPU, but that is pretty unfortunate: the only reason you are reading the data back is to learn how many vertices the GPU generated, just so you can send that vertex count straight back to the GPU! That is like a guy in New York City sending a letter to another guy in Seattle asking what restaurants are in Brooklyn. Pretty inefficient!

Fortunately, we have a solution for this called DrawProceduralIndirect! After you finish generating/filling your compute buffer with data, you can use a second compute buffer to store the draw call arguments you would normally have passed into DrawProcedural. Now you don't need to read any data back from the GPU to the CPU! The compute buffer data and the draw call args all live in GPU local memory, so they are very fast for the GPU to fetch and use. DrawProceduralIndirect is a way for the CPU to say "I have no idea how many triangles are in whatever it is the GPU made in the compute shader, but the GPU knows, so just draw it!"
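Before diving into the full example, here is the shape of the pattern in isolation. This is a minimal sketch, not the tutorial code: the kernel name FillArgs and the buffer names are placeholders, and the four ints match the argument layout used by the tutorial's own args struct (vertex count per instance, instance count, start vertex, start instance).

// Sketch: let a compute kernel write the draw arguments, then draw indirectly.
// Layout: vertexCountPerInstance, instanceCount, startVertexLocation, startInstanceLocation.
ComputeBuffer argsBuffer = new ComputeBuffer(1, sizeof(int) * 4, ComputeBufferType.IndirectArguments);
int kernel = computeShader.FindKernel("FillArgs");   // hypothetical kernel name
computeShader.SetBuffer(kernel, "DrawCallArgs", argsBuffer);
computeShader.Dispatch(kernel, 1, 1, 1);             // the GPU fills the args; no readback
// ... later, when rendering:
Graphics.DrawProceduralIndirect(MeshTopology.Points, argsBuffer);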
Frame Debugger Capture:
The following capture shows that a single instanced draw call is used to draw all the points.
The following capture shows the contents of the compute buffer that was uploaded to the GPU and used by the rasterization shader.
Code:
ComputeShaderExample.cs
// This example demonstrates how to use compute shaders
// to do a cool effect which would be hard to do without
// compute shaders. The compute shader will also provide
// the draw call arguments so we can do an indirect draw call
// and keep most of the work on the GPU.
// The output from the compute shader will be a set of points
// which will be expanded by the geometry shader into quads
// composed of two triangles.
using System.Collections;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using UnityEngine;

public class ComputeShaderExample : MonoBehaviour
{
    public Material geometryMaterial;
    public Texture image;
    public Texture colorNoise;
    public ComputeShader computeShader;

    int TotalPoints;
    ComputeBuffer pointComputeBuffer;
    ComputeBuffer drawCallArgsComputeBuffer;

    [StructLayout(LayoutKind.Sequential)]
    struct ComputeBufferPointData
    {
        public const int size = sizeof(float) * 3
                              + sizeof(float) * 3
                              + sizeof(float) * 3
                              + sizeof(float) * 2
                              + sizeof(float) * 1;
        public Vector3 centerPosition;
        public Vector3 startingPosition;
        public Vector3 color;
        public Vector2 uv;   // Vector2 to match the float2 uv in the shader-side struct.
        public float scale;
    }

    [StructLayout(LayoutKind.Sequential)]
    struct DrawCallArgBuffer
    {
        public const int size = sizeof(int) + sizeof(int) + sizeof(int) + sizeof(int);
        public int vertexCountPerInstance;
        public int instanceCount;
        public int startVertexLocation;
        public int startInstanceLocation;
    }

    // Use this for initialization
    void Start()
    {
        TotalPoints = image.width * image.height;

        // pointComputeBuffer will contain all the individual points we wish to render.
        pointComputeBuffer = new ComputeBuffer(TotalPoints, ComputeBufferPointData.size);

        // Generate the draw call args on the GPU so we can make an indirect draw call.
        // This isn't the best example of using an indirect draw call because we can
        // easily calculate how many points we need based on the image's width and height.
        // However, this example demonstrates how to do it anyway and, at the very least,
        // we save a little bandwidth by not sending the draw call parameters from the CPU to the GPU.
        drawCallArgsComputeBuffer = new ComputeBuffer(1, DrawCallArgBuffer.size, ComputeBufferType.IndirectArguments);

        // Find our compute shader kernel and populate it with our input/output buffers.
        // pointComputeBuffer and drawCallArgsComputeBuffer will be our output from the
        // compute shader to the rasterization shader.
        int kernelId = computeShader.FindKernel("CSMain");
        // Cast to float so the aspect ratio isn't truncated by integer division.
        computeShader.SetVector("ScreenSize", new Vector4(Screen.width, Screen.height, (float)Screen.height / Screen.width, 1));
        computeShader.SetTexture(kernelId, "InputPointClampImage", image);
        computeShader.SetTexture(kernelId, "InputPointClampColorNoise", colorNoise);
        computeShader.SetBuffer(kernelId, "PointData", pointComputeBuffer);
        computeShader.SetBuffer(kernelId, "DrawCallArgs", drawCallArgsComputeBuffer);
        computeShader.Dispatch(kernelId, image.width / 8, image.height / 8, 1);
    }

    private void OnDestroy()
    {
        pointComputeBuffer.Release();
        drawCallArgsComputeBuffer.Release();
    }

    void OnPostRender()
    {
        // Use the geometry shader and bind the compute buffer data to the shader
        // as a structured buffer resource that can be indexed via the current instance id.
        // Set the material data before SetPass so it is bound when the draw is issued.
        geometryMaterial.SetBuffer("_ComputeBufferData", pointComputeBuffer);
        geometryMaterial.SetTexture("_NoiseTexture", colorNoise);
        geometryMaterial.SetPass(0);
        //Graphics.DrawProcedural(MeshTopology.Points, 1, TotalPoints);
        Graphics.DrawProceduralIndirect(MeshTopology.Points, drawCallArgsComputeBuffer);
    }
}
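A couple of practical notes on the script above. OnPostRender is only invoked on scripts attached to a Camera, so this component needs to live on the camera object for the draw to be issued. Also, if you ever want to sanity-check the arguments the compute shader wrote, you can read them back to the CPU. This is exactly the readback the indirect draw exists to avoid, so treat the following as a debug-only sketch:

// Debug-only: read the indirect args back to the CPU to verify them.
// This stalls until the GPU work completes, so never ship it in a hot path.
int[] args = new int[4];
drawCallArgsComputeBuffer.GetData(args);
Debug.LogFormat("vertsPerInstance={0} instances={1} startVert={2} startInstance={3}",
    args[0], args[1], args[2], args[3]);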
ImageComputeShader.shader
// Each #kernel tells which function to compile; you can have many kernels
#pragma kernel CSMain

Texture2D InputPointClampImage;
SamplerState samplerInputPointClampImageSampler;
Texture2D InputPointClampColorNoise;
SamplerState samplerInputPointClampColorNoiseSampler;
float4 ScreenSize;

struct ComputeBufferData
{
    float3 centerPosition;
    float3 startingPosition;
    float3 color;
    float2 uv;
    float scale;
};

// A structured buffer allows us to have a list or array
// of custom buffer data that we can index.
RWStructuredBuffer<ComputeBufferData> PointData;

struct DrawCallArgsBuffer
{
    uint vertexCountPerInstance;
    uint instanceCount;
    uint startVertexLocation;
    uint startInstanceLocation;
};
RWStructuredBuffer<DrawCallArgsBuffer> DrawCallArgs;

[numthreads(8,8,1)]
void CSMain (uint3 id : SV_DispatchThreadID)
{
    uint width, height, levels;
    InputPointClampImage.GetDimensions(0, width, height, levels);
    float4 imageSize = float4(width, height, width / 2.0, height / 2.0);

    // Calculate what the scale should be based on the image width
    // because we don't want the image to be larger than the screen size.
    float scale = (ScreenSize.x / imageSize.x) * 0.001;
    float2 uv = float2(id.x / imageSize.x, id.y / imageSize.y);

    // Generate a noise value to compute each pixel's starting position.
    // We are going to use this to animate each pixel.
    float3 noise = InputPointClampColorNoise.SampleLevel(samplerInputPointClampColorNoiseSampler, uv, 0).rgb * 2.0 - 1.0;
    noise.x *= 1.5;
    noise.y = lerp(1.0, 2.0, abs(sin(noise.y))) * 2.0;

    uint index = (id.y * width) + id.x;
    PointData[index].centerPosition = float3((id.x * scale) - (imageSize.z * scale), (id.y * scale) - (imageSize.w * scale), 1);
    PointData[index].startingPosition = noise.xyz;
    PointData[index].color = InputPointClampImage.SampleLevel(samplerInputPointClampImageSampler, uv, 0).rgb;
    PointData[index].uv = uv;
    PointData[index].scale = scale;

    // Set up our draw call arguments so we can use DrawProceduralIndirect to submit our draw call.
    // This isn't the best use of an indirect draw, but it saves us from having to pass this
    // information again from the CPU to the GPU.
    if ((id.x + id.y) == 0)
    {
        DrawCallArgs[0].vertexCountPerInstance = 1;
        DrawCallArgs[0].instanceCount = width * height;
        DrawCallArgs[0].startVertexLocation = 0;
        DrawCallArgs[0].startInstanceLocation = 0;
    }
}
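One caveat on the dispatch: with [numthreads(8,8,1)], the C# side divides the image dimensions by 8 using integer division, so any image whose width or height is not a multiple of 8 would leave the last partial row and column of pixels unprocessed. A common fix (my sketch, not part of the original tutorial) is to round the thread-group counts up and early-out in the kernel:

// Sketch: round thread-group counts up so odd-sized images are fully covered.
// The kernel then needs a guard such as: if (id.x >= width || id.y >= height) return;
int groupsX = (image.width + 7) / 8;
int groupsY = (image.height + 7) / 8;
computeShader.Dispatch(kernelId, groupsX, groupsY, 1);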
GeometryShaderExample.shader
Shader "Unlit/GeometryShaderExample" { Properties { } SubShader { Tags { "RenderType"="Opaque" } Pass { CGPROGRAM #pragma vertex vert #pragma geometry geom #pragma fragment frag #pragma enable_d3d11_debug_symbols #pragma target 5.0 #include "UnityCG.cginc" struct v2g { float4 position : SV_POSITION; float3 startingPosition : TEXCOORD0; float3 color : TEXCOORD1; float2 uv : TEXCOORD2; float scale : TEXCOORD3; }; struct g2f { float4 position : SV_POSITION; float3 color : TEXCOORD0; float3 uv : TEXCOORD1; }; struct ComputeBufferData { float3 centerPosition; float3 startingPosition; float3 color; float2 uv; float scale; }; // A structured buffer allows us to have a list or array // of custom buffer data that we can index. StructuredBuffer _ComputeBufferData; v2g vert (uint instanceId : SV_InstanceID) { v2g o; // Index our compute buffer data via the current instance id // since we used an instanced draw call. ComputeBufferData computeBufferData =_ComputeBufferData[instanceId]; o.position = UnityObjectToClipPos(computeBufferData.centerPosition); o.startingPosition = computeBufferData.startingPosition; o.color = computeBufferData.color; o.uv = computeBufferData.uv; o.scale = computeBufferData.scale; return o; } [maxvertexcount(6)] void geom( point v2g i[1], inout TriangleStream triangleStream) { g2f o; // Make sure we multiply our scale's x component by the aspect ratio // so our quad won't be squashed. const float aspectRatio = _ScreenParams.y / _ScreenParams.x; float2 scale = float2(i[0].scale * aspectRatio, i[0].scale); // Currently, the point's position is in clip space. I would like to get it // in normalized device coordinate space which will be between -1 and 1 for the x and y components. // This makes the space a little easier. So we need to divde each component by w. const float OneOverW = 1.0 / i[0].position.w; // Interpolate using a hermite curve. // This will make the fall apart animation a bit smoother. float t = sin(_Time.y) * 0.5 + 0.5; t = (-2*t*t*t) + (3*t*t); float3 position = lerp(i[0].startingPosition.xyz, i[0].position.xyz, t); float3 ndc = float3(position.x * OneOverW, position.y * OneOverW, i[0].position.z * OneOverW); // We will generate 2 triangles to build up our quad. // We will add a scale offset and then multiply by w to put the position back into // clip space as that is the space that the fragment shader expects the position to be in. // I will also procedurally generate uvs for each vertex so that we can take advantage of // uvs in the fragement shader. 
// Left Triangle o.position = float4((ndc + float3(scale.x, -scale.y, 0)) * i[0].position.w, i[0].position.w); o.color = i[0].color; o.uv = float3(i[0].uv, t); triangleStream.Append(o); o.position = float4((ndc + float3(-scale.x, scale.y, 0)) * i[0].position.w, i[0].position.w); o.color = i[0].color; o.uv = float3(i[0].uv, t); triangleStream.Append(o); o.position = float4((ndc + float3(-scale.x, -scale.y, 0)) * i[0].position.w, i[0].position.w); o.color = i[0].color; o.uv = float3(i[0].uv, t); triangleStream.Append(o); triangleStream.RestartStrip(); // Right Triangle o.position = float4((ndc + float3(-scale.x, scale.y, 0)) * i[0].position.w, i[0].position.w); o.color = i[0].color; o.uv = float3(i[0].uv, t); triangleStream.Append(o); o.position = float4((ndc + float3(scale.x, -scale.y, 0)) * i[0].position.w, i[0].position.w); o.color = i[0].color; o.uv = float3(i[0].uv, t); triangleStream.Append(o); o.position = float4((ndc + float3(scale.x, scale.y, 0)) * i[0].position.w, i[0].position.w); o.color = i[0].color; o.uv = float3(i[0].uv, t); triangleStream.Append(o); triangleStream.RestartStrip(); } sampler2D _NoiseTexture; fixed4 generateEyeGlow(float2 suv, float2 position, float3 noiseValue, float scale, fixed4 color) { float noisyAmp = lerp(noiseValue.x, noiseValue.y, sin(_Time.y * 5.0) * 0.5 + 0.5) * scale; float particleShape = pow(1.0 / (length(suv + position) * noisyAmp), 0.9); fixed4 finalColor = color; finalColor.rgb *= particleShape; return finalColor; } fixed4 frag (g2f i) : SV_Target { // To show off the uvs we procedurally generated in the geometry shader, // we will make the quad fade from top to bottom. //fixed4 col = fixed4(i.uv.yyy,1) * float4(i.color.rgb, 1.0); float3 noiseValue = tex2D(_NoiseTexture, i.uv).rgb; float2 suv = i.uv * 2.0 - 1.0; // Give Fogerty some glowing eyes! fixed4 finalColor = lerp(float4(0,0,0,1), float4(i.color.rgb, 1.0), i.uv.z); finalColor += generateEyeGlow(suv, float2(0.17, -0.3), noiseValue, 25.0, fixed4(0.2, 0.4, 0.8, 1.0)); finalColor += generateEyeGlow(suv, float2(-0.17, -0.3), noiseValue, 25.0, fixed4(0.2, 0.4, 0.8, 1.0)); return finalColor; } ENDCG } } } |