792char * build_shader (External * externals, const ForeachData * loop,
793 const RegionParameters * region, const GLuint nwg[2])
795 int Nl = region->level > 0 ? 1 << (region->level - 1) : N/Dimensions.x;
797 snprintf (s, 19, "%d", nconst > 0 ? nconst : 1);
799 snprintf (a, 19, "%g", nconst > 0 ? _constant[0] : 0);
800 char * fs = str_append (NULL, "#version 430\n", glsl_preproc,
801 "const int _nconst = ", s, ";\n"
802 "const real _constant[_nconst] = {", a);
803 for (int i = 1; i < nconst; i++) {
804 snprintf (a, 19, "%g", _constant[i]);
805 fs = str_append (fs, ",", a);
807 fs = str_append (fs, "};\n"
808 "layout(std430, binding = 0)"
809 " restrict buffer _data_layout { real f[]; } _data");
810 if (GPUContext.nssbo > 1) {
811 snprintf (a, 19, "%d", GPUContext.nssbo);
812 snprintf (s, 29, "%ld", GPUContext.max_ssbo_size/sizeof(real));
813 fs = str_append (fs, "[", a, "];\n"
814 "#define _data_val(field,index) _data[_offset[(field)].i+(_offset[(field)].j+(index))/", s,
815 "].f[(_offset[(field)].j+(index))%", s, "]\n");
818 fs = str_append (fs, ";\n"
819 "#define _data_val(field,index) _data.f[(field)*field_size() + (index)]\n");
824 char * attributes = NULL;
825 for (const External * g = externals; g; g = g->next)
826 if (g->name[0] == '.
') {
827 attributes = str_append (attributes, " ", type_string (g), " ", g->name + 1);
828 for (int * d = g->data; d && *d > 0; d++) {
829 char s[20]; snprintf (s, 19, "%d", *d);
830 attributes = str_append (attributes, "[", s, "]");
832 attributes = str_append (attributes, ";\n");
836 fs = str_append (fs, "struct _Attributes {\n", attributes, "};\n");
837 sysfree (attributes);
839 for (int _s = 0; _s < 1; _s++) /* scalar in baseblock */
843 char s[20]; snprintf (s, 19, "%d", nindex);
844 fs = str_append (fs, "const _Attributes _attr[", s, "]={");
846 for (int _s = 0; _s < 1; _s++) /* scalar in baseblock */
848 fs = str_append (fs, nindex ? ",{" : "{"); nindex++;
850 char * data = (char *) &_attribute[s.i];
851 for (const External * g = externals; g; g = g->next)
852 if (g->name[0] == '.
') {
853 if (!first) fs = str_append (fs, ",");
855 if (g->type == sym_INT) {
856 char s[20]; snprintf (s, 19, "%d", *((int *)(data + g->nd)));
857 fs = str_append (fs, s);
859 else if (g->type == sym_BOOL)
860 fs = str_append (fs, *((bool *)(data + g->nd)) ? "true" : "false");
861 else if (g->type == sym_FLOAT) {
862 char s[20]; snprintf (s, 19, "%g", *((float *)(data + g->nd)));
863 fs = str_append (fs, s);
865 else if (g->type == sym_DOUBLE) {
866 char s[20]; snprintf (s, 19, "%g", *((double *)(data + g->nd)));
867 fs = str_append (fs, s);
869 else if (g->type == sym_IVEC) {
870 ivec * v = (ivec *)(data + g->nd);
871 char s[20]; snprintf (s, 19, "{%d,%d}", v->x, v->y);
872 fs = str_append (fs, s);
874 else if (g->type == sym_function_declaration) {
875 void * func = *((void **)(data + g->nd));
877 fs = str_append (fs, "0");
879 External * ptr = _get_function ((long) func);
880 char s[20]; snprintf (s, 19, "%d", ptr->nd);
881 fs = str_append (fs, s);
884 else if (g->type == sym_SCALAR)
885 fs = write_scalar (fs, *((scalar *)(data + g->nd)));
886 else if (g->type == sym_VECTOR)
887 fs = write_vector (fs, *((vector *)(data + g->nd)));
888 else if (g->type == sym_TENSOR)
889 fs = write_tensor (fs, *((tensor *)(data + g->nd)));
891 fs = str_append (fs, "not implemented");
893 fs = str_append (fs, "}");
895 fs = str_append (fs, "};\n");
901 if (GPUContext.nssbo > 1) {
903 for (int _s = 0; _s < 1; _s++) /* scalar in baseblock */
904 if (s.gpu.index && s.i + s.block > imax)
905 imax = s.i + s.block;
906 for (External * g = externals; g; g = g->next)
909 if (s.i + s.block > imax)
910 imax = s.i + s.block;
912 char string[100]; snprintf (string, 19, "%d", imax);
913 fs = str_append (fs, "const struct { uint i, j; } _offset[", string, "]={");
914 for (int s = 0; s < imax; s++) {
915 fs = str_append (fs, s ? ",{" : "{");
916 size_t offset = s*field_size(), size = GPUContext.max_ssbo_size/sizeof(real);
917 size_t i = offset/size, j = offset%size;
918 snprintf (string, 99, "%ld,%ld", i, j);
919 fs = str_append (fs, string, "}");
921 if (j + field_size() > 1L << 32) {
922 fprintf (stderr, "%s:%d: error: resolution is too high for 32-bits indexing\n",
927 fs = str_append (fs, "};\n");
933 for (External * g = externals; g; g = g->next) {
934 if (g->name[0] == '.
') {
935 if (g->type == sym_function_declaration) {
936 bool boundary = is_boundary_attribute (g);
937 fs = str_append (fs, "#define _attr_", g->name + 1, "(s,args) (");
938 foreach_function (f, f->used = false);
940 for (int _s = 0; _s < 1; _s++) /* scalar in baseblock */
942 char * data = (char *) &_attribute[s.i];
943 void * func = *((void **)(data + g->nd));
945 External * f = _get_function ((long) func);
946 if (!f->used && (!boundary || (s.stencil.io & s_output))) {
948 char * args = is_void_function (f->data) ? " args,0" : " args";
950 expr = str_append (NULL, "(", f->name, args, ")");
953 snprintf (index, 19, "%d", f->nd);
954 char * s = str_append (NULL, "_attr(s,", g->name + 1, ")==", index,
955 "?(", f->name, args, "):", expr);
963 fs = str_append (fs, expr, ")\n");
967 fs = str_append (fs, "0)\n");
970 else if (g->type == sym_function_definition) {
971 External * f = _get_function ((long) g->pointer);
972 fs = str_append (fs, "\n", f->data, "\n");
973 char s[20]; snprintf (s, 19, "%d", f->nd);
974 fs = str_append (fs, "const int _p", g->name, " = ", s, ";\n");
976 else if (g->type == sym_function_declaration) {
977 External * f = _get_function ((long) g->pointer);
978 char s[20]; snprintf (s, 19, "%d", f->nd);
979 fs = str_append (fs, "const int ", g->name, " = ", s, ";\n",
980 "#define _f", g->name, "(args) (", f->name, " args)\n");
982 else if (g->type != sym_SCALAR &&
983 g->type != sym_VECTOR &&
984 g->type != sym_TENSOR) {
985 if (g->type == sym_INT && !strcmp (g->name, "N")) {
986 int level = region->level > 0 ? region->level - 1 : depth();
987 char s[20], d[20], l[20];
988 snprintf (s, 19, "%d", Nl);
989 snprintf (d, 19, "%d", depth());
990 snprintf (l, 19, "%d", level);
992 "const uint N = ", s, ", _depth = ", d, ";\n");
993 if (GPUContext.nssbo == 1) {
995 snprintf (size, 29, "%ld", (size_t) field_size());
997 "const uint _field_size = ", size, ";\n");
999#ifdef shift_level // multigrid
1000 fs = str_append (fs,
1001 "const uint _shift[_depth + 1] = {");
1002 for (int d = 0; d <= depth(); d++) {
1003 snprintf (s, 19, "%ld", shift_level(d));
1004 fs = str_append (fs, d > 0 ? "," : "", s);
1006 fs = str_append (fs, "};\n");
1007#endif // ifdef shift_level
1008 snprintf (s, 19, "{%d,%d}", Dimensions.x, Dimensions.y);
1009 fs = str_append (fs,
1010 "const ivec Dimensions = ", s, ";\n"
1011 "const uint NY = ", loop->face > 1 || loop->vertex ?
1012 "N*Dimensions.y + 1" : "N*Dimensions.y", ";\n");
1013 if (GPUContext.fragment_shader)
1014 fs = str_append (fs, "in vec2 vsPoint;\n"
1015 "Point point = {int((vsPoint.x*vsScale.x + vsOrigin.x)*N*Dimensions.x)"
1017 "int((vsPoint.y*vsScale.y + vsOrigin.y)*N*Dimensions.x) + GHOSTS,", l,
1019 ",ivec2(N*Dimensions.x,N*Dimensions.y)"
1027 "out vec4 FragColor;\n");
1029 char nwgx[20], nwgy[20];
1030 snprintf (nwgx, 19, "%d", nwg[0]);
1031 snprintf (nwgy, 19, "%d", nwg[1]);
1032 fs = str_append (fs, "layout (local_size_x = ", nwgx,
1033 ", local_size_y = ", nwgy, ") in;\n");
1036 else if (g->type == sym_INT && !strcmp (g->name, "nl")) {
1042 snprintf (nl, 19, "%d", *((int *)g->pointer));
1043 fs = str_append (fs, "const int nl = ", nl, ";\n");
1045 else if (g->type == sym_INT && !strcmp (g->name, "bc_period_x"))
1046 fs = str_append (fs, "const int bc_period_x = ", Period.x ?
1047 "int(N*Dimensions.x)" : "-1", ";\n");
1048 else if (g->type == sym_INT && !strcmp (g->name, "bc_period_y"))
1049 fs = str_append (fs, "const int bc_period_y = ", Period.y ?
1050 "int(N*Dimensions.y)" : "-1", ";\n");
1051 else if (GPUContext.fragment_shader && (region->n.x > 1 || region->n.y > 1) &&
1052 g->type == sym_COORD && !strcmp (g->name, "p")) {
1058 fs = str_append (fs, "coord p = vec3((vsPoint*vsScale + vsOrigin)*L0 + vec2(X0, Y0),0);\n");
1060 else if (IS_EXTERNAL_CONSTANT(g)) {
1061 // fixme: for the moment only 'const
int' are considered, this could be generalised
1063 assert (g->pointer);
1064 snprintf (value, 19, "%d", *((int *)g->pointer));
1065 fs = str_append (fs, "const ", type_string (g), " ", EXTERNAL_NAME (g), "=", value, ";\n");
1067 else if (strcmp (g->name, "Dimensions")) {
1068 char * type = type_string (g);
1069 fs = str_append (fs, "uniform ", type, " ", EXTERNAL_NAME (g));
1070 for (int * d = g->data; d && *d > 0; d++) {
1071 char s[20]; snprintf (s, 19, "%d", *d);
1072 fs = str_append (fs, "[", s, "]");
1074 fs = str_append (fs, ";\n");
1076 fs = str_append (fs, type, " ", g->global == 2 ? "_loc_" : "", g->name, " = ",
1077 EXTERNAL_NAME (g), ";\n");
1078 fs = str_append (fs, "const scalar ", g->name, "_out_ = ");
1079 fs = write_scalar (fs, g->s);
1080 fs = str_append (fs, ";\n");
1084 else { // scalar, vector and tensor fields
1085 int size = list_size (g);
1086 for (int j = 0; j < size; j++) {
1088 fs = str_append (fs, "const ", type_string (g), " ", EXTERNAL_NAME (g));
1090 fs = str_append (fs, " = ");
1092 char s[20]; snprintf (s, 19, "%d", size);
1093 fs = str_append (fs, "[", s, "] = {");
1096 if (g->nd == 0 || j < size - 1) {
1097 if (g->type == sym_SCALAR)
1098 fs = write_scalar (fs, ((scalar *)g->pointer)[j]);
1099 else if (g->type == sym_VECTOR)
1100 fs = write_vector (fs, ((vector *)g->pointer)[j]);
1101 else if (g->type == sym_TENSOR)
1102 fs = write_tensor (fs, ((tensor *)g->pointer)[j]);
1106 else { // last element of a list is always ignored (this is necessary for empty lists)
1107 if (g->type == sym_SCALAR)
1108 fs = str_append (fs, "{0,0}");
1109 else if (g->type == sym_VECTOR)
1110 fs = str_append (fs, "{{0,0},{0,0}}");
1111 else if (g->type == sym_TENSOR)
1112 fs = str_append (fs, "{{{0,0},{0,0}},{{0,0},{0,0}}}");
1117 fs = str_append (fs, ";\n");
1118 else if (j < size - 1)
1119 fs = str_append (fs, ",");
1121 fs = str_append (fs, "};\n");
1296static void gpu_cpu_sync_scalar (scalar s, char * sep, GLenum mode)
1298 assert ((mode == GL_MAP_READ_BIT && s.gpu.stored < 0) ||
1299 (mode == GL_MAP_WRITE_BIT && s.gpu.stored > 0));
1300 if (s.gpu.stored > 0 && !(s.stencil.bc & s_centered))
1302 GL_C (glMemoryBarrier (GL_BUFFER_UPDATE_BARRIER_BIT));
1303 size_t size = (size_t)field_size()*sizeof(real), offset = s.i*size, totalsize = s.block*size;
1304 char * cd = grid_data() + offset;
1305 int index = offset/GPUContext.max_ssbo_size;
1306 offset -= index*GPUContext.max_ssbo_size;
1308 GL_C (glBindBuffer (GL_SHADER_STORAGE_BUFFER, GPUContext.ssbo[index]));
1309 size_t size = min (totalsize, GPUContext.max_ssbo_size - offset);
1311 // fprintf (stderr, "map %d %ld %ld\n", index, offset, size);
1313 char * gd = glMapBufferRange (GL_SHADER_STORAGE_BUFFER, offset, size, mode);
1315 if (mode == GL_MAP_READ_BIT)
1316 memcpy (cd, gd, size);
1317 else if (mode == GL_MAP_WRITE_BIT)
1318 memcpy (gd, cd, size);
1321 assert (glUnmapBuffer (GL_SHADER_STORAGE_BUFFER));
1322 cd += size, totalsize -= size, offset = 0, index++;
1324 GL_C (glBindBuffer (GL_SHADER_STORAGE_BUFFER, 0));
1326 fprintf (stderr, "%s%s", sep, s.name);
1359void reset_gpu (void * alist, double val)
1361 size_t size = (size_t)field_size()*sizeof(real);
1362 scalar * list = alist;
1363 for (int _s = 0; _s < 1; _s++) /* scalar in list */
1364 if (!is_constant(s)) {
1365 size_t offset = s.i*size, totalsize = max(s.block, 1)*size;
1366 int index = offset/GPUContext.max_ssbo_size;
1367 offset -= index*GPUContext.max_ssbo_size;
1369 GL_C (glBindBuffer (GL_SHADER_STORAGE_BUFFER, GPUContext.ssbo[index]));
1370 size_t size = min (totalsize, GPUContext.max_ssbo_size - offset);
1373 GL_C (glClearBufferSubData (GL_SHADER_STORAGE_BUFFER, GL_R32F,
1375 GL_RED, GL_FLOAT, &fval));
1377 GL_C (glClearBufferSubData (GL_SHADER_STORAGE_BUFFER, GL_RG32UI,
1379 GL_RG_INTEGER, GL_UNSIGNED_INT, &val));
1381 totalsize -= size, offset = 0, index++;
1385 GL_C (glBindBuffer (GL_SHADER_STORAGE_BUFFER, 0));
1462static External * merge_externals (External * externals, const ForeachData * loop)
1464 External * merged = NULL, * end = NULL;
1465 static External ext[] = {
1466 { .name = "X0", .type = sym_DOUBLE, .pointer = &X0, .global = 1 },
1467 { .name = "Y0", .type = sym_DOUBLE, .pointer = &Y0, .global = 1 },
1468 { .name = "Z0", .type = sym_DOUBLE, .pointer = &Z0, .global = 1 },
1469 { .name = "L0", .type = sym_DOUBLE, .pointer = &L0, .global = 1 },
1470 { .name = "N", .type = sym_INT, .pointer = &N, .global = 1 },
1472 { .name = "nl", .type = sym_INT, .pointer = &nl, .global = 1 },
1473 { .name = "_layer", .type = sym_INT, .pointer = &_layer, .global = 1 },
1474 { .name = ".block", .type = sym_INT, .nd = attroffset (block) },
1478 static External bc = {
1479 .name = "apply_bc", .type = sym_function_declaration, .pointer = (void *)(long)apply_bc
1482 for (External * g = externals; g->name; g++) {
1484 if (g->global == 2) g->global = 0;
1486 foreach_function (f, f->used = false);
1487 for (External * g = ext; g->name; g++) {
1489 merged = merge_external (merged, &end, g, loop);
1493 merged = merge_external (merged, &end, &bc, loop);
1495 for (External * g = externals; g->name; g++)
1496 merged = merge_external (merged, &end, g, loop);
1498 for (External * i = merged; i; i = i->next)
1499 fprintf (stderr, "external %s %d %p %d\n", i->name, i->type, i->pointer, i->global);
1502 for (External * g = merged; g; g = g->next)
1503 if (g->global && !strcmp (g->name, "apply_bc_list"))
1504 g->pointer = loop->dirty;
1509static Shader * compile_shader (ForeachData * loop,
1511 const RegionParameters * region,
1512 External * externals,
1513 const char * kernel)
1515 const char * error = strstr (kernel, "@error ");
1517 for (const char * s = error + 7; *s != '\
n' && *s != '\0
'; s++)
1519 fputc ('\
n', stderr);
1524 External * merged = merge_externals (externals, loop);
1531 for (const External * g = merged; g; g = g->next) {
1534 if (g->type != sym_SCALAR && g->type != sym_VECTOR && g->type != sym_TENSOR) {
1535 if (g->reduct && !strchr ("+mM", g->reduct)) {
1538 "%s:%d: GLSL: error: unknown reduction operation '%
c'\n",
1539 loop->fname, loop->line, g->reduct);
1542 if (g->type == sym_COORD || g->type == sym__COORD || g->type == sym_VEC4) {
1546 "%s:%d: GLSL: error: reductions not implemented for '%
s' type\n",
1547 loop->fname, loop->line, type_string (g));
1551 else if (g->type != sym_FLOAT &&
1552 g->type != sym_DOUBLE &&
1553 g->type != sym_INT &&
1554 g->type != sym_LONG &&
1555 g->type != sym_BOOL &&
1556 g->type != sym_enumeration_constant &&
1557 g->type != sym_IVEC &&
1558 g->type != sym_function_declaration &&
1559 g->type != sym_function_definition) {
1561 fprintf (stderr, "%s:%d: GLSL: error: unknown type %d for '%
s'\n",
1562 loop->fname, loop->line, g->type, g->name);
1571 static const int NWG[2] = {16, 16};
1572 GLuint ng[2], nwg[2];
1573 int Nl = region->level > 0 ? 1 << (region->level - 1) : N/Dimensions.x;
1574 int * dims = &Dimensions.x;
1575 if (loop->face || loop->vertex)
1576 for (int i = 0; i < 2; i++) {
1577 if (Nl*dims[1-i] > NWG[i]) {
1578 nwg[i] = NWG[i] + 1;
1579 ng[i] = Nl*dims[1-i]/NWG[i];
1582 nwg[i] = Nl*dims[1-i] + 1;
1585 assert (nwg[i]*ng[i] >= Nl*dims[1-i] + 1);
1588 for (int i = 0; i < 2; i++) {
1589 nwg[i] = Nl < NWG[i] ? Nl : NWG[i];
1590 ng[i] = Nl*dims[1-i]/nwg[i];
1591 assert (nwg[i]*ng[i] == Nl*dims[1-i]);
1594 char * shader = build_shader (merged, loop, region, nwg);
1602 shader = str_append (shader, "void _loop (");
1603 for (const External * g = merged; g; g = g->next)
1604 if (g->global == 2) {
1605 shader = str_append (shader, local++ == 1 ? "" : ", ", type_string (g), " ", g->name);
1607 int size = list_size (g);
1609 char s[20]; snprintf (s, 19, "%d", size);
1610 shader = str_append (shader, "[", s, "]");
1614 shader = str_append (shader, ") {\n");
1617 shader = str_append (shader, "void main() {\n");
1619 if (!GPUContext.fragment_shader) {
1621 snprintf (d, 19, "%d", region->level > 0 ? region->level - 1 : depth());
1622 shader = str_append (shader, "Point point = {csOrigin.x + int(gl_GlobalInvocationID.y) + GHOSTS,"
1623 "csOrigin.y + int(gl_GlobalInvocationID.x) + GHOSTS,", d,
1625 ",ivec2((1<<",d,")*Dimensions.x,(1<<",d,")*Dimensions.y)"
1636 shader = str_append (shader,
1637 "if (point.i < N*Dimensions.x + 2*GHOSTS && "
1638 "point.j < N*Dimensions.y + 2*GHOSTS) {\n");
1640 shader = str_append (shader, " int ig = -1, jg = -1;\n");
1641 shader = str_append (shader, kernel);
1642 shader = str_append (shader, "\nif (point.j - GHOSTS < NY) {");
1643 for (const External * g = merged; g; g = g->next)
1645 shader = str_append (shader, "\n val_red_(", g->name, "_out_) = ", g->name, ";");
1649 shader = str_append (shader, "\n}",
1650 loop->dirty ? "apply_bc(point);" : "",
1654 shader = str_append (shader, "void main(){_loop(");
1656 for (const External * g = merged; g; g = g->next)
1658 shader = str_append (shader, local++ == 1 ? "" : ",", EXTERNAL_NAME (g));
1659 shader = str_append (shader, ");}\n");
1662 Shader * s = load_shader (shader, hash, loop);
1666 s->ng[0] = ng[0], s->ng[1] = ng[1];
1671 for (External * g = merged; g; g = g->next)
1674 for (External * g = externals; g && g->name; g++)
1677 for (const External * g = merged; g; g = g->next) {
1678 if (g->name[0] == '.
') continue;
1679 if (IS_EXTERNAL_CONSTANT(g)) continue;
1680 if (g->type == sym_function_declaration || g->type == sym_function_definition) continue;
1681 if (g->type == sym_INT && (!strcmp (g->name, "N") ||
1682 !strcmp (g->name, "nl") ||
1683 !strcmp (g->name, "bc_period_x") ||
1684 !strcmp (g->name, "bc_period_y")))
1686 if (g->type == sym_INT ||
1687 g->type == sym_LONG ||
1688 g->type == sym_FLOAT ||
1689 g->type == sym_DOUBLE ||
1690 g->type == sym__COORD ||
1691 g->type == sym_COORD ||
1692 g->type == sym_BOOL ||
1693 g->type == sym_VEC4) {
1694 char * name = str_append (NULL, EXTERNAL_NAME (g));
1695 int location = glGetUniformLocation (s->id, name);
1697 if (location >= 0) {
1698 // fprintf (stderr, "%s:%d: %s\n", loop->fname, loop->line, name);
1699 // not an array or just a one-dimensional array
1701 assert (!g->data || ((int *)g->data)[1] == 0);
1702 int nd = g->data ? ((int *)g->data)[0] : 1;
1703 s->uniforms = realloc (s->uniforms, (nuniforms + 2)*sizeof(MyUniform));
1704 s->uniforms[nuniforms] = (MyUniform){
1705 .location = location, .type = g->type, .nd = nd,
1706 .local = g->global == 1 ? -1 : g->used - 1,
1707 .pointer = g->global == 1 ? g->pointer : NULL };
1708 s->uniforms[nuniforms + 1].type = 0;
1710 // uniforms refering to local variables must be in the 'externals
' local list
1711 assert (g->global == 1 || g->used);
1730static Shader * setup_shader (ForeachData * loop, const RegionParameters * region,
1731 External * externals,
1732 const char * kernel)
1738 apply_bc_list = loop->dirty;
1739 for (int _s = 0; _s < 1; _s++) /* scalar in loop->dirty */ {
1745 for (int b = 0; b < nboundary; b++)
1746 if (s.boundary_stencil[b])
1747 s.boundary_stencil[b] ((Point){0}, (Point){0}, s, NULL);
1752 if (!(s.stencil.io & s_output))
1753 s.stencil.io |= s_output;
1756 for (int _s = 0; _s < 1; _s++) /* scalar in baseblock */
1759 for (int _s = 0; _s < 1; _s++) /* scalar in baseblock */
1760 if (((s.stencil.io & s_input) || (s.stencil.io & s_output)) && !s.gpu.index) {
1762 fprintf (stderr, "%s:%d: %s:%s%s index: %d\n",
1763 loop->fname, loop->line,
1764 s.name, (s.stencil.io & s_input) ? " input" : "",
1765 (s.stencil.io & s_output) ? " output" : "", index);
1767 if (s.v.x.i == -1) // scalar
1768 s.gpu.index = index++;
1771 for (int _c = 0; _c < 1; _c++) /* scalar in {v} */
1773 c.gpu.index = index++;
1776 for (int _s = 0; _s < 1; _s++) /* scalar in loop->dirty */ {
1778 fprintf (stderr, "%s:%d: dirty: %s:%s%s index: %d\n",
1779 loop->fname, loop->line, s.name,
1780 (s.stencil.io & s_input) ? " input" : "",
1781 (s.stencil.io & s_output) ? " output" : "",
1784 s.boundary_left = s.boundary[left];
1785 s.boundary_right = s.boundary[right];
1786 s.boundary_top = s.boundary[top];
1787 s.boundary_bottom = s.boundary[bottom];
1793 for (External * g = externals; g && g->name; g++)
1795 scalar s = {0} /* new scalar */;
1796 s.stencil.io |= s_output;
1799 fprintf (stderr, "%s:%d: new reduction field %d for %s\n",
1800 loop->fname, loop->line, s.i, g->name);
1808 uint32_t hash = hash_shader (externals, loop, region, kernel);
1809 assert (gpu_grid->shaders);
1810 khiter_t k = kh_get (INT, gpu_grid->shaders, hash);
1811 if (k != kh_end (gpu_grid->shaders))
1812 shader = kh_value (gpu_grid->shaders, k);
1814 shader = compile_shader (loop, hash, region, externals, kernel);
1816 free_reduction_fields (externals);
1821 gpu_cpu_sync (baseblock, GL_MAP_WRITE_BIT, loop->fname, loop->line);
1829 scalar * listc = NULL;
1830 for (int _s = 0; _s < 1; _s++) /* scalar in loop->listc */
1831 if (!(s.stencil.bc & s_centered))
1832 listc = list_prepend (listc, s);
1833 scalar * listf_x = NULL, * listf_y = NULL;
1834 for (int _d = 0; _d < dimension; _d++)
1835 for (int _s = 0; _s < 1; _s++) /* scalar in loop->listf.x */
1836 if (!(s.stencil.bc & s_face))
1837 listf_x = list_prepend (listf_x, s);
1838 if (listc || listf_x || listf_y) {
1840 fprintf (stderr, "%s:%d: applying BCs for", loop->fname, loop->line);
1841 for (int _s = 0; _s < 1; _s++) /* scalar in listc */
1842 fprintf (stderr, " %s", s.name);
1843 for (int _d = 0; _d < dimension; _d++)
1844 for (int _s = 0; _s < 1; _s++) /* scalar in listf_x */
1845 fprintf (stderr, " %s", s.name);
1846 fputc ('\
n', stderr);
1848 int nvar = datasize/sizeof(real);
1849 _Attributes backup[nvar];
1850 memcpy (backup, _attribute, nvar*sizeof(_Attributes));
1852 for (int _s = 0; _s < 1; _s++) /* scalar in listc */
1853 s[] = s[]; // does nothing but ensures that BCs are applied at the end of the loop
1854 for (int _d = 0; _d < dimension; _d++)
1855 for (int _s = 0; _s < 1; _s++) /* scalar in listf_x */
1858 memcpy (_attribute, backup, nvar*sizeof(_Attributes));
1859 for (int _s = 0; _s < 1; _s++) /* scalar in listc */ {
1860 s.stencil.bc |= s_centered;
1864 for (int _d = 0; _d < dimension; _d++) {
1865 for (int _s = 0; _s < 1; _s++) /* scalar in listf_x */ {
1866 s.stencil.bc |= s_face;
1871 apply_bc_list = loop->dirty;
1880 if (shader->id != GPUContext.current_shader) {
1881 GL_C (glBindBufferBase (GL_SHADER_STORAGE_BUFFER, 0, 0));
1882 GL_C (glUseProgram (shader->id));
1883 for (int i = 0; i < GPUContext.nssbo; i++)
1884 GL_C (glBindBufferBase (GL_SHADER_STORAGE_BUFFER, i, GPUContext.ssbo[i]));
1885 GPUContext.current_shader = shader->id;
1891 for (const MyUniform * g = shader->uniforms; g && g->type; g++) {
1892 void * pointer = g->pointer;
1894 assert (g->local >= 0);
1895 pointer = externals[g->local].pointer;
1899 glUniform1iv (g->location, g->nd, pointer); break;
1901 glUniform1fv (g->location, g->nd, pointer); break;
1903 glUniform4fv (g->location, g->nd, pointer); break;
1906 bool * data = pointer;
1907 for (int i = 0; i < g->nd; i++)
1909 glUniform1iv (g->location, g->nd, p);
1914 long * data = pointer;
1915 for (int i = 0; i < g->nd; i++)
1917 glUniform1iv (g->location, g->nd, p);
1923 double * data = pointer;
1924 for (int i = 0; i < g->nd; i++)
1926 glUniform1fv (g->location, g->nd, p);
1931 double * data = pointer;
1932 for (int i = 0; i < 2*g->nd; i++)
1934 glUniform2fv (g->location, g->nd, p);
1939 double * data = pointer;
1940 for (int i = 0; i < 3*g->nd; i++)
1942 glUniform3fv (g->location, g->nd, p);
1945#else // DOUBLE_PRECISION
1947 glUniform1dv (g->location, g->nd, pointer); break;
1949 glUniform2dv (g->location, g->nd, pointer); break;
1951 glUniform3dv (g->location, g->nd, pointer); break;
1952#endif // DOUBLE_PRECISION
1961static bool doloop_on_gpu (ForeachData * loop, const RegionParameters * region,
1962 External * externals,
1963 const char * kernel)
1965 Shader * shader = setup_shader (loop, region, externals, kernel);
1974 int Nl = region->level > 0 ? 1 << (region->level - 1) : N/Dimensions.x;
1975 if (region->n.x == 1 && region->n.y == 1) {
1977 (region->p.x - X0)/L0*Nl*Dimensions.x,
1978 (region->p.y - Y0)/L0*Nl*Dimensions.x
1980 GL_C (glUniform2iv (0, 1, csOrigin));
1981 assert (!GPUContext.fragment_shader);
1982 GL_C (glMemoryBarrier (GL_SHADER_STORAGE_BARRIER_BIT));
1983 GL_C (glDispatchCompute (1, 1, 1));
1989 else if (region->n.x || region->n.y) {
1991 (region->box[1].x - region->box[0].x)/L0,
1992 (region->box[1].y - region->box[0].y)/L0
1994 float vsOrigin[] = { (region->box[0].x - X0)/L0, (region->box[0].y - Y0)/L0 };
1995 GL_C (glUniform2fv (1, 1, vsOrigin));
1996 GL_C (glUniform2fv (2, 1, vsScale));
1997 assert (GPUContext.fragment_shader);
1998 GL_C (glMemoryBarrier (GL_SHADER_STORAGE_BARRIER_BIT));
1999 GL_C (glDrawArrays (GL_TRIANGLES, 0, 6));
2003 assert (!GPUContext.fragment_shader);
2004 GL_C (glMemoryBarrier (GL_SHADER_STORAGE_BARRIER_BIT));
2005 GL_C (glDispatchCompute (shader->ng[0], shader->ng[1], 1));
2011 bool nreductions = false;
2012 for (const External * g = externals; g && g->name; g++)
2018 tracing ("gpu_reduction", loop->fname, loop->line);
2019 for (const External * g = externals; g && g->name; g++)
2022 double result = gpu_reduction (field_offset(s, region->level), g->reduct, region,
2023 loop->face == 1 ? (Nl*Dimensions.x + 1)*Nl*Dimensions.y :
2024 loop->face == 2 ? Nl*Dimensions.x*(Nl*Dimensions.y + 1) :
2025 loop->face == 3 || loop->vertex ?
2026 (Nl*Dimensions.x + 1)*(Nl*Dimensions.y + 1) :
2027 sq(Nl)*Dimensions.x*Dimensions.y);
2029 fprintf (stderr, "%s:%d: %s %c %g\n",
2030 loop->fname, loop->line, g->name, g->reduct, result);
2032 if (g->type == sym_DOUBLE) *((double *)g->pointer) = result;
2033 else if (g->type == sym_FLOAT) *((float *)g->pointer) = result;
2034 else if (g->type == sym_INT) *((int *)g->pointer) = result;
2040 end_tracing ("gpu_reduction", loop->fname, loop->line);
2045bool gpu_end_stencil (ForeachData * loop,
2046 const RegionParameters * region,
2047 External * externals,
2048 const char * kernel)
2050 bool on_gpu = ((loop->parallel == 1 && !on_cpu) || loop->parallel == 3) &&
2051 (loop->first || loop->data);
2053 on_gpu = doloop_on_gpu (loop, region, externals, kernel);
2055 fprintf (stderr, "%s:%d: %s: for (int _i = 0; _i < _N; _i++) /* foreach */ done on CPU (see GLSL errors above)\n",
2056 loop->fname, loop->line, loop->parallel == 3 ? "error" : "warning");
2057 if (loop->parallel == 3) // must run on GPU but cannot run
2063 // do not apply BCs on CPU
2064 free (loop->listc), loop->listc = NULL;
2065 for (int _d = 0; _d < dimension; _d++)
2066 free (loop->listf.x), loop->listf.x = NULL;
2067 for (int _s = 0; _s < 1; _s++) /* scalar in loop->dirty */
2068 s.stencil.bc = s_centered|s_face;
2069 free (loop->dirty), loop->dirty = NULL;
2072 gpu_cpu_sync (baseblock, GL_MAP_READ_BIT, loop->fname, loop->line);
2073 boundary_stencil (loop);
2076 for (int _s = 0; _s < 1; _s++) /* scalar in baseblock */
2077 if (s.stencil.io & s_output)
2078 s.gpu.stored = on_gpu ? -1 : 1;
2080 return on_gpu && loop->parallel != 3;