Basilisk CFD
Adaptive Cartesian mesh PDE framework
Loading...
Searching...
No Matches
grid.h File Reference
#include <khash.h>
#include "reduction.h"
Include dependency graph for grid.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  MyUniform
 
struct  Shader
 
struct  kh_INT_s
 
struct  GridGPU
 

Macros

#define gpu_grid   ((GridGPU *)grid)
 
#define str_append(dst, ...)   str_append_array (dst, (const char *[]){__VA_ARGS__, NULL})
 
#define xstr(a)   str(a)
 
#define str(a)   #a
 
#define IS_EXTERNAL_CONSTANT(g)   ((g)->constant && (g)->type == sym_INT && !(g)->data)
 
#define EXTERNAL_NAME(g)   (g)->global == 2 ? "_loc_" : "", (g)->name, (g)->reduct ? "_in_" : ""
 
#define reset(...)   reset_gpu (__VA_ARGS__)
 
#define init_grid(n)   gpu_init_grid(n)
 
#define free_grid()   gpu_free_grid()
 

Typedefs

typedef struct kh_INT_s kh_INT_t
 

Functions

static kh_INT_t * kh_init_INT (void)
 
static void kh_destroy_INT (kh_INT_t *h)
 
static void kh_clear_INT (kh_INT_t *h)
 
static khint_t kh_get_INT (const kh_INT_t *h, khint32_t key)
 
static int kh_resize_INT (kh_INT_t *h, khint_t new_n_buckets)
 
static khint_t kh_put_INT (kh_INT_t *h, khint32_t key, int *ret)
 
static void kh_del_INT (kh_INT_t *h, khint_t x)
 
static char * str_append_array (char *dst, const char *list[])
 
static int list_size (const External *i)
 
static char * write_scalar (char *fs, scalar s)
 
static char * write_vector (char *fs, vector v)
 
static char * write_tensor (char *fs, tensor t)
 
static void boundary_top (Point point, int i)
 
static void boundary_bottom (Point point, int i)
 
static void apply_bc (Point point)
 
static bool is_boundary_attribute (const External *g)
 
static void hash_external (Adler32Hash *hash, const External *g, const ForeachData *loop, int indent)
 
static uint32_t hash_shader (const External *externals, const ForeachData *loop, const RegionParameters *region, const char *kernel)
 
static bool is_void_function (char *code)
 
static char * type_string (const External *g)
 
trace char * build_shader (External *externals, const ForeachData *loop, const RegionParameters *region, const GLuint nwg[2])
 
trace Shader * load_shader (const char *fs, uint32_t hash, const ForeachData *loop)
 
void gpu_limits (FILE *fp)
 
void gpu_free ()
 
void gpu_init ()
 
void gpu_free_grid (void)
 
static trace void gpu_cpu_sync_scalar (scalar s, char *sep, GLenum mode)
 
static void gpu_cpu_sync (scalar *list, GLenum mode, const char *fname, int line)
 
trace void reset_gpu (void *alist, double val)
 
void gpu_init_grid (int n)
 
static External * append_external (External *externals, External **end, External *g)
 
static External * merge_external (External *externals, External **end, External *g, const ForeachData *loop)
 
static External * merge_externals (External *externals, const ForeachData *loop)
 
static trace Shader * compile_shader (ForeachData *loop, uint32_t hash, const RegionParameters *region, External *externals, const char *kernel)
 
static void free_reduction_fields (const External *externals)
 
static trace Shader * setup_shader (ForeachData *loop, const RegionParameters *region, External *externals, const char *kernel)
 
static bool doloop_on_gpu (ForeachData *loop, const RegionParameters *region, External *externals, const char *kernel)
 
bool gpu_end_stencil (ForeachData *loop, const RegionParameters *region, External *externals, const char *kernel)
 

Variables

bool on_cpu = false
 
static char glsl_preproc []
 
static scalar * apply_bc_list
 
static int bc_period_x = -1
 
static int bc_period_y = -1
 
 attribute
 The stored attribute tracks where the up-to-date field is stored:
 
double(* boundary_right )(Point, Point, scalar, bool *)
 
double(* boundary_top )(Point, Point, scalar, bool *)
 
double(* boundary_bottom )(Point, Point, scalar, bool *)
 

Macro Definition Documentation

◆ EXTERNAL_NAME

#define EXTERNAL_NAME (   g)    (g)->global == 2 ? "_loc_" : "", (g)->name, (g)->reduct ? "_in_" : ""

Definition at line 789 of file grid.h.

◆ free_grid

#define free_grid (   void)    gpu_free_grid()

Definition at line 1404 of file grid.h.

◆ gpu_grid

#define gpu_grid   ((GridGPU *)grid)

Definition at line 389 of file grid.h.

◆ init_grid

#define init_grid (   n)    gpu_init_grid(n)

Definition at line 1402 of file grid.h.

◆ IS_EXTERNAL_CONSTANT

#define IS_EXTERNAL_CONSTANT (   g)    ((g)->constant && (g)->type == sym_INT && !(g)->data)

Definition at line 638 of file grid.h.

◆ reset

#define reset (   ...)    reset_gpu (__VA_ARGS__)

Definition at line 1388 of file grid.h.

◆ str

#define str (   a)    #a

Definition at line 406 of file grid.h.

◆ str_append

#define str_append (   dst,
  ... 
)    str_append_array (dst, (const char *[]){__VA_ARGS__, NULL})

Definition at line 404 of file grid.h.

◆ xstr

#define xstr (   a)    str(a)

Definition at line 405 of file grid.h.

Typedef Documentation

◆ kh_INT_t

Function Documentation

◆ append_external()

static External * append_external ( External externals,
External **  end,
External g 
)
static

Definition at line 1408 of file grid.h.

◆ apply_bc()

static void apply_bc ( Point  point)
static

Definition at line 571 of file grid.h.

◆ boundary_bottom()

static void boundary_bottom ( Point  point,
int  i 
)
static

Definition at line 559 of file grid.h.

◆ boundary_top()

static void boundary_top ( Point  point,
int  i 
)
static

Definition at line 548 of file grid.h.

◆ build_shader()

trace char * build_shader ( External externals,
const ForeachData loop,
const RegionParameters region,
const GLuint  nwg[2] 
)

Scalar field attributes

Field offsets when using multiple SSBOs

Non-local variables

'int nl' gets special treatment.

'coord p' is assumed to be the parameter of a region. This is not flexible (the parameter must be called 'p') and should be improved.

Definition at line 792 of file grid.h.

◆ compile_shader()

static trace Shader * compile_shader ( ForeachData loop,
uint32_t  hash,
const RegionParameters region,
External externals,
const char kernel 
)
static

Number of compute shader work groups and groups

main()

Make list of uniforms

Definition at line 1509 of file grid.h.

◆ doloop_on_gpu()

static bool doloop_on_gpu ( ForeachData loop,
const RegionParameters region,
External externals,
const char kernel 
)
static

Render

If this is a foreach_point() iteration, we draw a single point

This is a region

Perform reductions and cleanup

Definition at line 1961 of file grid.h.

◆ free_reduction_fields()

static void free_reduction_fields ( const External externals)
static

Definition at line 1720 of file grid.h.

◆ gpu_cpu_sync()

static void gpu_cpu_sync ( scalar list,
GLenum  mode,
const char fname,
int  line 
)
static

Definition at line 1330 of file grid.h.

◆ gpu_cpu_sync_scalar()

static trace void gpu_cpu_sync_scalar ( scalar  s,
char sep,
GLenum  mode 
)
static

Definition at line 1296 of file grid.h.

◆ gpu_end_stencil()

bool gpu_end_stencil ( ForeachData loop,
const RegionParameters region,
External externals,
const char kernel 
)

Definition at line 2045 of file grid.h.

Referenced by foreach_stencil_generic().

Here is the caller graph for this function:

◆ gpu_free()

void gpu_free ( )

Definition at line 1184 of file grid.h.

◆ gpu_free_grid()

void gpu_free_grid ( void  )

Definition at line 1266 of file grid.h.

◆ gpu_init()

void gpu_init ( )

Definition at line 1214 of file grid.h.

◆ gpu_init_grid()

void gpu_init_grid ( int  n)

Definition at line 1390 of file grid.h.

◆ gpu_limits()

void gpu_limits ( FILE fp)

Definition at line 1173 of file grid.h.

◆ hash_external()

static void hash_external ( Adler32Hash hash,
const External g,
const ForeachData loop,
int  indent 
)
static

Definition at line 641 of file grid.h.

◆ hash_shader()

static uint32_t hash_shader ( const External externals,
const ForeachData loop,
const RegionParameters region,
const char kernel 
)
static

Definition at line 704 of file grid.h.

◆ is_boundary_attribute()

static bool is_boundary_attribute ( const External g)
static

Definition at line 628 of file grid.h.

◆ is_void_function()

static bool is_void_function ( char code)
static

Definition at line 758 of file grid.h.

◆ kh_clear_INT()

static void kh_clear_INT ( kh_INT_t h)
inlinestatic

Definition at line 381 of file grid.h.

◆ kh_del_INT()

static void kh_del_INT ( kh_INT_t h,
khint_t  x 
)
inlinestatic

Definition at line 381 of file grid.h.

◆ kh_destroy_INT()

static void kh_destroy_INT ( kh_INT_t h)
inlinestatic

Definition at line 381 of file grid.h.

◆ kh_get_INT()

static khint_t kh_get_INT ( const kh_INT_t h,
khint32_t  key 
)
inlinestatic

Definition at line 381 of file grid.h.

◆ kh_init_INT()

static kh_INT_t * kh_init_INT ( void  )
inlinestatic

Definition at line 381 of file grid.h.

◆ kh_put_INT()

static khint_t kh_put_INT ( kh_INT_t h,
khint32_t  key,
int ret 
)
inlinestatic

Definition at line 381 of file grid.h.

◆ kh_resize_INT()

static int kh_resize_INT ( kh_INT_t h,
khint_t  new_n_buckets 
)
inlinestatic

Definition at line 381 of file grid.h.

◆ list_size()

static int list_size ( const External i)
inlinestatic

Definition at line 487 of file grid.h.

◆ load_shader()

trace Shader * load_shader ( const char fs,
uint32_t  hash,
const ForeachData loop 
)

Definition at line 1130 of file grid.h.

Referenced by gpu_reduction().

Here is the caller graph for this function:

◆ merge_external()

static External * merge_external ( External externals,
External **  end,
External g,
const ForeachData loop 
)
static

Check whether a local g (resp. i) shadows a global i (resp. g).

Definition at line 1419 of file grid.h.

◆ merge_externals()

static External * merge_externals ( External externals,
const ForeachData loop 
)
static

Definition at line 1462 of file grid.h.

◆ reset_gpu()

trace void reset_gpu ( void alist,
double  val 
)

Definition at line 1359 of file grid.h.

◆ setup_shader()

static trace Shader * setup_shader ( ForeachData loop,
const RegionParameters region,
External externals,
const char kernel 
)
static

We will directly apply boundary conditions to fields marked 'dirty' by automatic stencils.

We also apply boundary stencils so that input/output are also set properly for boundary conditions which may use external fields.

We make sure all fields marked dirty are also outputs.

Allocate reduction fields

Reuse or compile a new shader

Apply boundary conditions

This can be required if boundary conditions have been modified between loops.

For the Intel driver, it looks like the next line is necessary to ensure proper synchronisation of the compute shader and fragment shader (for example when using output_ppm() for interactive display). The nvidia driver somehow does not need this...

Set uniforms

Definition at line 1730 of file grid.h.

◆ str_append_array()

static char * str_append_array ( char dst,
const char list[] 
)
static

Definition at line 391 of file grid.h.

◆ type_string()

static char * type_string ( const External g)
static

Definition at line 767 of file grid.h.

◆ write_scalar()

static char * write_scalar ( char fs,
scalar  s 
)
static

Definition at line 510 of file grid.h.

◆ write_tensor()

static char * write_tensor ( char fs,
tensor  t 
)
static

Definition at line 534 of file grid.h.

◆ write_vector()

static char * write_vector ( char fs,
vector  v 
)
static

Definition at line 524 of file grid.h.

Variable Documentation

◆ apply_bc_list

scalar* apply_bc_list
static

Definition at line 544 of file grid.h.

◆ attribute

attribute
Initial value:
{
int x
Definition common.h:76
Definition linear.h:21

The stored attribute tracks where the up-to-date field is stored:

0: on both the CPU and GPU (i.e. synchronized).
1: on the CPU.
-1: on the GPU.

Definition at line 1274 of file grid.h.

◆ bc_period_x

int bc_period_x = -1
static

Definition at line 546 of file grid.h.

◆ bc_period_y

int bc_period_y = -1
static

Definition at line 546 of file grid.h.

◆ boundary_bottom

double(* boundary_bottom) (Point, Point, scalar, bool *) ( Point  ,
Point  ,
scalar  ,
bool  
)

Definition at line 1278 of file grid.h.

◆ boundary_right

double(* boundary_right) (Point, Point, scalar, bool *) ( Point  ,
Point  ,
scalar  ,
bool  
)

Definition at line 1276 of file grid.h.

◆ boundary_top

double(* boundary_top) (Point, Point, scalar, bool *) ( Point  ,
Point  ,
scalar  ,
bool  
)

Definition at line 1277 of file grid.h.

◆ glsl_preproc

char glsl_preproc[]
static

Definition at line 408 of file grid.h.

Referenced by gpu_reduction().

◆ on_cpu

bool on_cpu = false

Grids on GPUs

The files in this directory implement Cartesian and Multigrid grids on Graphics Processing Units (GPUs). The ultimate goal is to allow running any Basilisk solver on GPUs without any modification to the original source code.

To do so the Basilisk preprocessor automatically generates "computation kernels" for each loop iterator. These kernels are then dynamically compiled (at runtime) by the OpenGL Shading Language (GLSL) compiler which is part of the (OpenGL) graphics card driver. If compilation is successful, the corresponding loop is performed on the GPU, otherwise the CPU is used. If this hybrid GPU/CPU mode of operation is used, synchronisation between the GPU and CPU memory is necessary and is done automatically.

OpenGL is an open standard (unlike e.g. CUDA) and is widely supported by graphics cards (with the notable exception of Apple graphics cards and some high-end "professional" Nvidia cards).

Running on GPUs

As described above, from a Basilisk perspective GPUs are just another type of grid. Selecting a "GPU grid" can simply be done using either

#include "grid/gpu/multigrid.h"

in the source code, or using the -grid command line option of qcc like this

qcc -autolink -Wall -O2 -grid=gpu/multigrid code.c -o code -lm

The standard Basilisk Makefile also includes the handy recipe

make code.gpu.tst

which will compile and run code.c using the gpu/multigrid grid.

Note that for all this to work properly you first need to install the Basilisk GPU libraries.

Installation

Basilisk uses the GLFW library to configure and access the graphics card and OpenGL (version >= 4.3) for the rest. These libraries and the associated Basilisk libraries can be easily installed on Debian-like systems using

sudo apt install libglfw3-dev
cd $BASILISK/grid/gpu
make

Note that you will also need the appropriate graphics card drivers (often proprietary for Nvidia). Note also that (reasonably high-end) laptop computers often have two graphics cards: a low-power, slow one and a high-power, fast one. To check which one you are currently using you can use something like

sudo apt install mesa-utils
glxinfo -B

On my Dell XPS laptop I can switch to the (proprietary driver of the) fast Nvidia graphics card using

__NV_PRIME_RENDER_OFFLOAD=1 __GLX_VENDOR_LIBRARY_NAME=nvidia glxinfo -B

Tests

There are several test cases for GPUs you can try. For example

cd $BASILISK/test
CFLAGS=-DPRINTNSHADERS make gpu.gpu.tst

If this worked, you can then try a more interesting example

CFLAGS='-DSHOW' make bump2D-gpu.tst
__NV_PRIME_RENDER_OFFLOAD=1 __GLX_VENDOR_LIBRARY_NAME=nvidia ./bump2D-gpu/bump2D-gpu 10

and also

cd $BASILISK/examples
CFLAGS='-DSHOW -DBENCHMARK' make turbulence.gpu.tst
__NV_PRIME_RENDER_OFFLOAD=1 __GLX_VENDOR_LIBRARY_NAME=nvidia ./turbulence.gpu/turbulence.gpu 1024

Writing code compatible with GPUs

GPUs are fast compared to CPUs because they use specialised hardware which relies on highly-parallel (tens of thousands of execution threads) asynchronous accesses to fast video memory channels. This imposes strong constraints on programs which can run efficiently on these systems, in particular regarding memory allocation and accesses. These constraints are reflected in the programming languages usable on GPUs, for example the OpenGL Shading Language (GLSL) which underlies the GPU grid in Basilisk.

GLSL is mostly a subset of C99 and the good news is that this subset happens to be what is used within most foreach loops in Basilisk (this is not a coincidence...). Thus, in many cases, simple and efficient Basilisk code will also run transparently and efficiently on GPUs.

GPU/CPU hybrid mode

There are obvious cases where foreach loops will not run on GPUs (see the next section). In these cases, Basilisk will automatically switch to running the loop on the CPU and will synchronize the CPU and GPU memories. Note that this also means that the memory (i.e. scalar fields etc.) for a program is always allocated twice: once on the CPU and once on the GPU.

As an example, consider the following simple code

int main()
{
init_grid (16);
scalar s[];
double k = 2.;
foreach()
s[] = cos(k*x)*sin(k*y);
foreach()
printf ("%g %g %g\n", x, y, s[]);
}

this can be run on the CPU using e.g.

make test.tst

If we now run on the GPU using

make test.gpu.tst

we get

test.gpu.c:9: GLSL: error: unknown function 'printf'
test.gpu.c:8: warning: foreach() done on CPU (see GLSL errors above)

Basilisk warns us that "printf" is not known in GLSL (at line 9) and that, as a consequence, the loop at line 8 (i.e. the second loop which includes "printf") was run on the CPU. Note that the first message is a "GLSL: error" but that the code still ran OK on the CPU. Note also that this error happened at runtime and not during compilation. That's because foreach loops are compiled dynamically at runtime by the graphics card driver.

Since GPUs have a very limited access to the operating system (i.e. only through the OpenGL interface) we cannot expect the loop including "printf" (or any other output) to run on the GPU. Note also that the second loop should be "serial" rather than parallel (see Parallel Programming). So we need to modify the code to

...
foreach (serial)
printf ("%g %g %g\n", x, y, s[]);
...

If we now recompile and run with make test.gpu.tst, the GLSL error and warnings are gone since we explicitly specified that the second loop should run on the CPU (and in serial mode).

Another way to specify that a given loop should run on the CPU (either in serial or parallel mode) is to use

foreach (cpu)
...

Similarly one could use foreach (gpu) to force running on the GPU, in which case the GLSL warning above would become an error. This can be useful when debugging GPU codes and used in combination with the -cpu compilation flag which will force loops to run on the CPU by default.

Variable-size arrays

In C99 variable-size arrays can be defined simply using for example

void func1 (int n, double a[n]) {
...
}
...
{
int m = ...;
double b[m];
func1 (m, b);
}
const vector a
Definition all-mach.h:59
define m((k)==0 &&(l)==0 &&(m)==0) macro2 foreach_point(double _x=0.
define sysmalloc malloc define syscalloc calloc define sysrealloc realloc define sysfree free define systrdup strdup define func
Definition config.h:120
else return n
Definition curvature.h:101
size *double * b

Since this relies on dynamic memory allocation on the stack, this is not possible in general in GLSL. The only case where this will work is when the size of the array can be computed "statically" i.e. at the time the GLSL kernel is compiled. Furthermore, the GLSL compiler is strict (or not very clever) and code looking like

int n = 3;
double a[n];

will fail with an error like

GLSL: error: array size must be a constant valued expression

To fix this one needs to write instead

const int n = 3;
double a[n];

Note that the size of the array must be a constant, but only at the time when the GLSL kernel is compiled. This allows using variable-size arrays also in GLSL, provided their size is constant within the kernel. For example the following code will work fine on the GPU, even if n changes between calls to func2().

void func2 (const int n) {
foreach() {
const int size = n + 1;
double a[size];
...
}
}

Finally, using variable-sized arrays as function parameters, as done in func1() above, is not allowed in GLSL. To work around this strong limitation, the kernel preprocessor will expand calls to functions using variable-size arrays (using the macro engine). Note that this means that the function must respect the constraints applying to macros, in particular they can return only at the end of the function.

What cannot be done on GPUs

- Inputs/Outputs: The only possible direct output on GPUs is the screen (see output_ppm on GPUs). All other inputs or outputs must go through the CPU.
- Complex memory allocation and access: There is no notion of "memory stack" on GPUs, all memory requirements are static and must be defined at compilation time. This means that variable/dynamical arrays, dynamic memory allocation (malloc etc.) and pointers do not exist on GPUs (and in GLSL). Using any of these in a foreach loop will give you a GLSL error as above.
- Limited support for function pointers: function pointers are fundamentally different from memory pointers. Basilisk includes limited support for function pointers i.e. what is sufficient for field attributes implementing custom boundary conditions or coarsening/refinement functions.
- Using external libraries: GPUs cannot (obviously) use functions defined in external (CPU) libraries.

Current limitations

Aside from the fundamental constraints above, the current implementation also has the following limitations, some of which will be lifted in the (not-too-distant) future. In rough order of "lifting priority" these are:

- Only 2D Cartesian and Multigrid grids for now: 3D multigrid will follow easily, quadtrees and octrees are more difficult.
- The maximum size of any scalar field is limited to what can be indexed using a 32-bit unsigned integer i.e. 2^32 floats or 16 GB.
- Boundary conditions have only been implemented for 3x3 stencils.
- At this stage only a few solvers have been tested. Other solvers may or may not work. In particular surface tension will not work yet because the estimation of curvature relies on code which is not portable to GPUs.
- Only simple external types (int, bool, float, double, coord etc.) are currently supported: for example custom typedefs are not supported yet for external variables. Loop-local variables and functions can use (locally-defined) custom types.
- Loop-local lists of scalars have very limited support (but are not used much anyway): external loops support is OK.
- Double precision (64-bit floats) is supported by Basilisk (use CFLAGS='-DDOUBLE_PRECISION') but depends on the (often limited) support by graphics cards and their drivers. Note also that using single precision can have an important impact on the convergence and accuracy of multigrid solvers.

Performance

To convince yourself that GPUs are worth the trouble, see the GPU Benchmarks: speedups of one to two orders of magnitude compared to CPUs are achievable. To maximize performance, here are a few tips and observations:

- Make sure that you are using the correct graphics card and driver (see glxinfo in the Installation section above).
- GPUs are highly parallel so will only provide speedups for large enough simulations (e.g. larger than 128^2), increasingly so as resolution is increased.
- Frequent CPU/GPU memory synchronisation will kill performance. Be careful to control how often you output data for example, much more so than when running on CPUs. An exception is graphical outputs (see output.h) which are much cheaper on GPUs and can be done often with little overhead.
- Loops done on the CPU within e.g. timestep iterations will generally kill performance.
- Use built-in profiling (see /src/README.trace) to check where time is spent. Use the -DTRACE=3 compilation flag to get profiling information at the level of foreach loops.

Bugs

- Trying to use more than one shader storage buffer (SSBO) with Intel drivers does not seem to work. This limits the amount of available video memory to a single SSBO with a maximum size which is usually 2GB (or less).

See also

- Test cases on GPUs (/src/test/READMEgpu-tests)
- GPU benchmarks (src/grid/gpu/Benchmarks.md)
- Computation kernels (/src/ast/kernels.c)

Implementation

Definition at line 367 of file grid.h.