Godot Version
Godot 4.2.1 Mono
Question - Compute Shader Returning Unpredictable NaN
Context:
I am writing this compute shader for some 3D boids. This is the first compute shader I’ve written, so I’m by no means an expert, but I have written some very basic OpenGL GLSL before. This is a naive first iteration that I am just trying to get working, but guidance on optimisation is welcome.
Details:
The plan is to have dynamic values for the boids that can change during runtime. This includes cohesion allignment and seperation radii and scalars, and the number of boids.
There are lists for position and velocity. The compute shader takes in these lists and returns the velocity, as the new positions will be set with the returned velocity, the position data doesn’t need to be returned. So: [input_velocities] + [input_positions] → Compute → [output_velocities].
These velocities are then fed to the boids, the boids return there new positions.
Code Snippets:
Listing 1
Skipping the boilerplate of setting up shader, the gdscript invoking the compute shader looks like so:
func run_compute() -> void:
params_data = PackedFloat32Array([
float(boid_num),
cohesion_radius,
cohesion_factor,
seperation_radius,
seperation_factor,
allignment_radius,
allignment_factor
])
//Clear buffers from previous loop
rd.buffer_clear(position_buffer, 0, pos_size_in_bytes)
rd.free_rid(position_buffer)
rd.buffer_clear(velocity_buffer, 0, vel_size_in_bytes)
rd.free_rid(velocity_buffer)
# Create properly sized velocities PackedVector3Array for buffer
input_velocities.clear()
input_velocities.resize(boid_num)
rd.buffer_clear(params_buffer, 0, params_size_in_bytes)
rd.free_rid(params_buffer)
# New buffers - Save sizes of lists in this iteration for buffer clearing
# next iteration
position_buffer = gen_vec3_buffer(input_positions)
pos_size_in_bytes = input_positions.to_byte_array().size()
velocity_buffer = gen_vec3_buffer(input_velocities)
vel_size_in_bytes = input_velocities.to_byte_array().size()
params_buffer = gen_float_buffer(params_data)
params_size_in_bytes = params_data.to_byte_array().size()
# New uniforms
pos_uniform = generate_uniform(position_buffer,
RenderingDevice.UNIFORM_TYPE_STORAGE_BUFFER, 0)
vel_uniform = generate_uniform(velocity_buffer,
RenderingDevice.UNIFORM_TYPE_STORAGE_BUFFER, 1)
params_uniform = generate_uniform(params_buffer,
RenderingDevice.UNIFORM_TYPE_STORAGE_BUFFER, 2)
var boid_uniform_set : RID = rd.uniform_set_create([pos_uniform, vel_uniform,
params_uniform], shader, 0)
# Defining compute list and dispatch
var compute_list := rd.compute_list_begin()
rd.compute_list_bind_compute_pipeline(compute_list, pipeline)
rd.compute_list_bind_uniform_set(compute_list, boid_uniform_set, 0)
# 16 is the minimum BOIDS amount as local_size_x = 16 (for now)
rd.compute_list_dispatch(compute_list, ceil(num_boids / 16), 1, 1)
rd.compute_list_end()
# Submit to GPU and wait for sync
rd.submit()
rd.sync()
# Read back the data from the buffer
var output_velocities_bytes : PackedByteArray =
rd.buffer_get_data(velocity_buffer)
var output_velocities_floats : PackedFloat32Array =
output_velocities_bytes.to_float32_array()
var output_velocities_vectors : PackedVector3Array =
float32_arr_to_vec3_arr(output_velocities_floats)
input_velocities.clear()
input_velocities.append_array(output_velocities_vectors)
Listing 2
The helper functions used in Listing 1 above:
func gen_vec3_buffer(data : PackedVector3Array) -> RID:
var data_buffer_bytes : PackedByteArray = data.to_byte_array()
var data_buffer : RID = rd.storage_buffer_create(data_buffer_bytes.size(),
data_buffer_bytes)
return data_buffer
func gen_float_buffer(data : PackedFloat32Array) -> RID:
var data_buffer_bytes : PackedByteArray = data.to_byte_array()
var data_buffer : RID = rd.storage_buffer_create(data_buffer_bytes.size(),
data_buffer_bytes)
return data_buffer
func generate_uniform(id: RID, type, binding: int) -> RDUniform:
var uniform : RDUniform = RDUniform.new()
uniform.uniform_type = type
uniform.binding = binding
uniform.add_id(id) return uniform
func float32_arr_to_vec3_arr(float_array: PackedFloat32Array) -> PackedVector3Array:
var vec3_array : PackedVector3Array = []
vec3_array.resize(float_array.size() / 3) #Each Vec3 consists of 3 floats
for i in range(vec3_array.size()):
var x = float_array[(i * 3)]
var y = float_array[(i * 3) + 1]
var z = float_array[(i * 3) + 2]
vec3_array[i] = Vector3(x, y, z)
return vec3_array
Listing 3
The GLSL compute shader:
#[compute]
#version 450
layout(local_size_x = 16, local_size_y = 1, local_size_z = 1) in;
layout(set = 0, binding = 0, std430) restrict buffer Position {
vec3 data[];
} boid_position;
layout(set = 0, binding = 1, std430) restrict buffer Velocity {
vec3 data[];
} boid_velocity;
layout(set = 0, binding = 2, std430) restrict buffer Params {
float num_boids;
float cohesion_radius;
float cohesion_factor;
float seperation_radius;
float seperation_factor;
float allignment_radius;
float allignment_factor;
} params;
void main() {
int my_index = int(gl_GlobalInvocationID.x);
vec3 my_pos = boid_position.data[my_index];
vec3 my_vel = boid_velocity.data[my_index];
vec3 cohesion_vec = vec3(0.0,0.0,0.0);
vec3 seperation_vec = vec3(0.0,0.0,0.0);
vec3 allignment_vec = vec3(0.0,0.0,0.0);
int num_local_boids = 0;
int avoids = 0;
for (int i = 0; i < int(params.num_boids); i++)
{
//If not self
if (i != my_index)
{
//Define vars for boid2 (b2)
vec3 b2_pos = boid_position.data[i];
vec3 b2_vel = boid_velocity.data[i];
float dist = distance(my_pos, b2_pos);
//Coherence - steer to center of near boids
if (dist <= params.cohesion_radius)
{
num_local_boids += 1;
cohesion_vec += b2_pos;
//Steer to face the direction of near boids
if (dist <= params.allignment_radius)
{
allignment_vec += b2_vel;
}
//Seperation - steer away from near boids
if (dist <= params.seperation_radius)
{
avoids += 1;
seperation_vec += (my_pos - b2_pos);
}
}
}
}
if (num_local_boids > 0)
{
//Add average position (center of nearby boids)
cohesion_vec *= 1.0 / float(num_local_boids);
cohesion_vec = normalize(cohesion_vec) * params.cohesion_factor;
my_vel += cohesion_vec;
//Add average velocity of nearby boids
allignment_vec *= 1.0 / float(num_local_boids);
allignment_vec = normalize(allignment_vec) * params.allignment_factor;
my_vel += allignment_vec;
//Adding seperation vec
if (avoids > 0)
{
seperation_vec = normalize(seperation_vec) * params.seperation_factor;
my_vel += seperation_vec;
}
}
//Return to input_velocities buffer
boid_velocity.data[my_index] = my_vel;
}
Debug info:
When printing output_velocities_bytes and output_velocities_floats from Listing 1, I could see the bytes [255, 255, 255, 127] resulted in floats NaN.
I found that [255, 255, 255, 127] → NaN → 0xFFFFFF7F by IEEE 754 signals “invalid operation or result”. Could’ve guessed that much but someone more informed that me might be able to gleam something from that. They seemed to occur relatively randomly. Once grouped into vec3’s, the same vector can contain components of NaN and 0. e.g. Vector3(nan, nan, 0).
I’ve been at this for a while and I’m not the most clever cookie, so I suspect I’ve overlooked something simple. If more details are required, ask away