// // Copyright (c) 2020 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #ifndef SUBGROUPCOMMONTEMPLATES_H #define SUBGROUPCOMMONTEMPLATES_H #include "typeWrappers.h" #include #include "CL/cl_half.h" #include "subhelpers.h" #include typedef std::bitset<128> bs128; static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id, const std::string &mask_type, cl_uint max_sub_group_size) { bs128 mask128; cl_uint4 mask; cl_uint pos = subgroup_local_id; if (mask_type == "eq") mask128.set(pos); if (mask_type == "le" || mask_type == "lt") { for (cl_uint i = 0; i <= pos; i++) mask128.set(i); if (mask_type == "lt") mask128.reset(pos); } if (mask_type == "ge" || mask_type == "gt") { for (cl_uint i = pos; i < max_sub_group_size; i++) mask128.set(i); if (mask_type == "gt") mask128.reset(pos); } // convert std::bitset<128> to uint4 auto const uint_mask = bs128{ static_cast(-1) }; mask.s0 = (mask128 & uint_mask).to_ulong(); mask128 >>= 32; mask.s1 = (mask128 & uint_mask).to_ulong(); mask128 >>= 32; mask.s2 = (mask128 & uint_mask).to_ulong(); mask128 >>= 32; mask.s3 = (mask128 & uint_mask).to_ulong(); return mask; } // DESCRIPTION : // sub_group_broadcast - each work_item registers it's own value. // All work_items in subgroup takes one value from only one (any) work_item // sub_group_broadcast_first - same as type 0. All work_items in // subgroup takes only one value from only one chosen (the smallest subgroup ID) // work_item // sub_group_non_uniform_broadcast - same as type 0 but // only 4 work_items from subgroup enter the code (are active) template struct BC { static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { int i, ii, j, k, n; int ng = test_params.global_workgroup_size; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int nj = (nw + ns - 1) / ns; int d = ns > 100 ? 100 : ns; int non_uniform_size = ng % nw; ng = ng / nw; int last_subgroup_size = 0; ii = 0; log_info(" sub_group_%s(%s)...\n", operation_names(operation), TypeManager::name()); if (non_uniform_size) { log_info(" non uniform work group size mode ON\n"); ng++; } for (k = 0; k < ng; ++k) { // for each work_group if (non_uniform_size && k == ng - 1) { set_last_workgroup_params(non_uniform_size, nj, ns, nw, last_subgroup_size); } for (j = 0; j < nj; ++j) { // for each subgroup ii = j * ns; if (last_subgroup_size && j == nj - 1) { n = last_subgroup_size; } else { n = ii + ns > nw ? nw - ii : ns; } int bcast_if = 0; int bcast_elseif = 0; int bcast_index = (int)(genrand_int32(gMTdata) & 0x7fffffff) % (d > n ? n : d); // l - calculate subgroup local id from which value will be // broadcasted (one the same value for whole subgroup) if (operation != SubgroupsBroadcastOp::broadcast) { // reduce brodcasting index in case of non_uniform and // last workgroup last subgroup if (last_subgroup_size && j == nj - 1 && last_subgroup_size < NR_OF_ACTIVE_WORK_ITEMS) { bcast_if = bcast_index % last_subgroup_size; bcast_elseif = bcast_if; } else { bcast_if = bcast_index % NR_OF_ACTIVE_WORK_ITEMS; bcast_elseif = NR_OF_ACTIVE_WORK_ITEMS + bcast_index % (n - NR_OF_ACTIVE_WORK_ITEMS); } } for (i = 0; i < n; ++i) { if (operation == SubgroupsBroadcastOp::broadcast) { int midx = 4 * ii + 4 * i + 2; m[midx] = (cl_int)bcast_index; } else { if (i < NR_OF_ACTIVE_WORK_ITEMS) { // index of the third // element int the vector. int midx = 4 * ii + 4 * i + 2; // storing information about // broadcasting index - // earlier calculated m[midx] = (cl_int)bcast_if; } else { // index of the third // element int the vector. int midx = 4 * ii + 4 * i + 3; m[midx] = (cl_int)bcast_elseif; } } // calculate value for broadcasting cl_ulong number = genrand_int64(gMTdata); set_value(t[ii + i], number); } } // Now map into work group using map from device for (j = 0; j < nw; ++j) { // for each element in work_group // calculate index as number of subgroup // plus subgroup local id x[j] = t[j]; } x += nw; m += 4 * nw; } } static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, const WorkGroupParams &test_params) { int ii, i, j, k, l, n; int ng = test_params.global_workgroup_size; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int nj = (nw + ns - 1) / ns; Ty tr, rr; int non_uniform_size = ng % nw; ng = ng / nw; int last_subgroup_size = 0; if (non_uniform_size) ng++; for (k = 0; k < ng; ++k) { // for each work_group if (non_uniform_size && k == ng - 1) { set_last_workgroup_params(non_uniform_size, nj, ns, nw, last_subgroup_size); } for (j = 0; j < nw; ++j) { // inside the work_group mx[j] = x[j]; // read host inputs for work_group my[j] = y[j]; // read device outputs for work_group } for (j = 0; j < nj; ++j) { // for each subgroup ii = j * ns; if (last_subgroup_size && j == nj - 1) { n = last_subgroup_size; } else { n = ii + ns > nw ? nw - ii : ns; } // Check result if (operation == SubgroupsBroadcastOp::broadcast_first) { int lowest_active_id = -1; for (i = 0; i < n; ++i) { lowest_active_id = i < NR_OF_ACTIVE_WORK_ITEMS ? 0 : NR_OF_ACTIVE_WORK_ITEMS; // findout if broadcasted // value is the same tr = mx[ii + lowest_active_id]; // findout if broadcasted to all rr = my[ii + i]; if (!compare(rr, tr)) { log_error( "ERROR: sub_group_broadcast_first(%s) " "mismatch " "for local id %d in sub group %d in group " "%d\n", TypeManager::name(), i, j, k); return TEST_FAIL; } } } else { for (i = 0; i < n; ++i) { if (operation == SubgroupsBroadcastOp::broadcast) { int midx = 4 * ii + 4 * i + 2; l = (int)m[midx]; tr = mx[ii + l]; } else { if (i < NR_OF_ACTIVE_WORK_ITEMS) { // take index of array where info // which work_item will be // broadcast its value is stored int midx = 4 * ii + 4 * i + 2; // take subgroup local id of // this work_item l = (int)m[midx]; // take value generated on host // for this work_item tr = mx[ii + l]; } else { int midx = 4 * ii + 4 * i + 3; l = (int)m[midx]; tr = mx[ii + l]; } } rr = my[ii + i]; // read device outputs for // work_item in the subgroup if (!compare(rr, tr)) { log_error("ERROR: sub_group_%s(%s) " "mismatch for local id %d in sub " "group %d in group %d - got %lu " "expected %lu\n", operation_names(operation), TypeManager::name(), i, j, k, rr, tr); return TEST_FAIL; } } } } x += nw; y += nw; m += 4 * nw; } log_info(" sub_group_%s(%s)... passed\n", operation_names(operation), TypeManager::name()); return TEST_PASS; } }; static float to_float(subgroups::cl_half x) { return cl_half_to_float(x.data); } static subgroups::cl_half to_half(float x) { subgroups::cl_half value; value.data = cl_half_from_float(x, CL_HALF_RTE); return value; } // for integer types template inline Ty calculate(Ty a, Ty b, ArithmeticOp operation) { switch (operation) { case ArithmeticOp::add_: return a + b; case ArithmeticOp::max_: return a > b ? a : b; case ArithmeticOp::min_: return a < b ? a : b; case ArithmeticOp::mul_: return a * b; case ArithmeticOp::and_: return a & b; case ArithmeticOp::or_: return a | b; case ArithmeticOp::xor_: return a ^ b; case ArithmeticOp::logical_and: return a && b; case ArithmeticOp::logical_or: return a || b; case ArithmeticOp::logical_xor: return !a ^ !b; default: log_error("Unknown operation request"); break; } return 0; } // Specialize for floating points. template <> inline cl_double calculate(cl_double a, cl_double b, ArithmeticOp operation) { switch (operation) { case ArithmeticOp::add_: { return a + b; } case ArithmeticOp::max_: { return a > b ? a : b; } case ArithmeticOp::min_: { return a < b ? a : b; } case ArithmeticOp::mul_: { return a * b; } default: log_error("Unknown operation request"); break; } return 0; } template <> inline cl_float calculate(cl_float a, cl_float b, ArithmeticOp operation) { switch (operation) { case ArithmeticOp::add_: { return a + b; } case ArithmeticOp::max_: { return a > b ? a : b; } case ArithmeticOp::min_: { return a < b ? a : b; } case ArithmeticOp::mul_: { return a * b; } default: log_error("Unknown operation request"); break; } return 0; } template <> inline subgroups::cl_half calculate(subgroups::cl_half a, subgroups::cl_half b, ArithmeticOp operation) { switch (operation) { case ArithmeticOp::add_: return to_half(to_float(a) + to_float(b)); case ArithmeticOp::max_: return to_float(a) > to_float(b) || is_half_nan(b.data) ? a : b; case ArithmeticOp::min_: return to_float(a) < to_float(b) || is_half_nan(b.data) ? a : b; case ArithmeticOp::mul_: return to_half(to_float(a) * to_float(b)); default: log_error("Unknown operation request"); break; } return to_half(0); } template bool is_floating_point() { return std::is_floating_point::value || std::is_same::value; } template void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng) { int nj = (nw + ns - 1) / ns; for (int k = 0; k < ng; ++k) { for (int j = 0; j < nj; ++j) { int ii = j * ns; int n = ii + ns > nw ? nw - ii : ns; for (int i = 0; i < n; ++i) { cl_ulong out_value; double y; if (operation == ArithmeticOp::mul_ || operation == ArithmeticOp::add_) { // work around to avoid overflow, do not use 0 for // multiplication out_value = (genrand_int32(gMTdata) % 4) + 1; } else { out_value = genrand_int64(gMTdata) % (32 * n); if ((operation == ArithmeticOp::logical_and || operation == ArithmeticOp::logical_or || operation == ArithmeticOp::logical_xor) && ((out_value >> 32) & 1) == 0) out_value = 0; // increase probability of false } set_value(t[ii + i], out_value); } } // Now map into work group using map from device for (int j = 0; j < nw; ++j) { x[j] = t[j]; } x += nw; m += 4 * nw; } } template struct SHF { static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { int i, ii, j, k, l, n, delta; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; int nj = (nw + ns - 1) / ns; int d = ns > 100 ? 100 : ns; ii = 0; ng = ng / nw; log_info(" sub_group_%s(%s)...\n", operation_names(operation), TypeManager::name()); for (k = 0; k < ng; ++k) { // for each work_group for (j = 0; j < nj; ++j) { // for each subgroup ii = j * ns; n = ii + ns > nw ? nw - ii : ns; for (i = 0; i < n; ++i) { int midx = 4 * ii + 4 * i + 2; l = (int)(genrand_int32(gMTdata) & 0x7fffffff) % (d > n ? n : d); switch (operation) { case ShuffleOp::shuffle: case ShuffleOp::shuffle_xor: // storing information about shuffle index m[midx] = (cl_int)l; break; case ShuffleOp::shuffle_up: delta = l; // calculate delta for shuffle up if (i - delta < 0) { delta = i; } m[midx] = (cl_int)delta; break; case ShuffleOp::shuffle_down: delta = l; // calculate delta for shuffle down if (i + delta >= n) { delta = n - 1 - i; } m[midx] = (cl_int)delta; break; default: break; } cl_ulong number = genrand_int64(gMTdata); set_value(t[ii + i], number); } } // Now map into work group using map from device for (j = 0; j < nw; ++j) { // for each element in work_group x[j] = t[j]; } x += nw; m += 4 * nw; } } static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, const WorkGroupParams &test_params) { int ii, i, j, k, l, n; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; int nj = (nw + ns - 1) / ns; Ty tr, rr; ng = ng / nw; for (k = 0; k < ng; ++k) { // for each work_group for (j = 0; j < nw; ++j) { // inside the work_group mx[j] = x[j]; // read host inputs for work_group my[j] = y[j]; // read device outputs for work_group } for (j = 0; j < nj; ++j) { // for each subgroup ii = j * ns; n = ii + ns > nw ? nw - ii : ns; for (i = 0; i < n; ++i) { // inside the subgroup // shuffle index storage int midx = 4 * ii + 4 * i + 2; l = (int)m[midx]; rr = my[ii + i]; switch (operation) { // shuffle basic - treat l as index case ShuffleOp::shuffle: tr = mx[ii + l]; break; // shuffle up - treat l as delta case ShuffleOp::shuffle_up: tr = mx[ii + i - l]; break; // shuffle up - treat l as delta case ShuffleOp::shuffle_down: tr = mx[ii + i + l]; break; // shuffle xor - treat l as mask case ShuffleOp::shuffle_xor: tr = mx[ii + (i ^ l)]; break; default: break; } if (!compare(rr, tr)) { log_error("ERROR: sub_group_%s(%s) mismatch for " "local id %d in sub group %d in group %d\n", operation_names(operation), TypeManager::name(), i, j, k); return TEST_FAIL; } } } x += nw; y += nw; m += 4 * nw; } log_info(" sub_group_%s(%s)... passed\n", operation_names(operation), TypeManager::name()); return TEST_PASS; } }; template struct SCEX_NU { static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; uint32_t work_items_mask = test_params.work_items_mask; ng = ng / nw; std::string func_name; work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive" : func_name = "sub_group_scan_exclusive"; log_info(" %s_%s(%s)...\n", func_name.c_str(), operation_names(operation), TypeManager::name()); log_info(" test params: global size = %d local size = %d subgroups " "size = %d work item mask = 0x%x \n", test_params.global_workgroup_size, nw, ns, work_items_mask); genrand(x, t, m, ns, nw, ng); } static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, const WorkGroupParams &test_params) { int ii, i, j, k, n; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; uint32_t work_items_mask = test_params.work_items_mask; int nj = (nw + ns - 1) / ns; Ty tr, rr; ng = ng / nw; std::string func_name; work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive" : func_name = "sub_group_scan_exclusive"; uint32_t use_work_items_mask; // for uniform case take into consideration all workitems use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask; for (k = 0; k < ng; ++k) { // for each work_group // Map to array indexed to array indexed by local ID and sub group for (j = 0; j < nw; ++j) { // inside the work_group mx[j] = x[j]; // read host inputs for work_group my[j] = y[j]; // read device outputs for work_group } for (j = 0; j < nj; ++j) { ii = j * ns; n = ii + ns > nw ? nw - ii : ns; std::set active_work_items; for (i = 0; i < n; ++i) { uint32_t check_work_item = 1 << (i % 32); if (use_work_items_mask & check_work_item) { active_work_items.insert(i); } } if (active_work_items.empty()) { log_info(" No acitve workitems in workgroup id = %d " "subgroup id = %d - no calculation\n", k, j); continue; } else if (active_work_items.size() == 1) { log_info(" One active workitem in workgroup id = %d " "subgroup id = %d - no calculation\n", k, j); continue; } else { tr = TypeManager::identify_limits(operation); int idx = 0; for (const int &active_work_item : active_work_items) { rr = my[ii + active_work_item]; if (idx == 0) continue; if (!compare_ordered(rr, tr)) { log_error( "ERROR: %s_%s(%s) " "mismatch for local id %d in sub group %d in " "group %d Expected: %d Obtained: %d\n", func_name.c_str(), operation_names(operation), TypeManager::name(), i, j, k, tr, rr); return TEST_FAIL; } tr = calculate(tr, mx[ii + active_work_item], operation); idx++; } } } x += nw; y += nw; m += 4 * nw; } log_info(" %s_%s(%s)... passed\n", func_name.c_str(), operation_names(operation), TypeManager::name()); return TEST_PASS; } }; // Test for scan inclusive non uniform functions template struct SCIN_NU { static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; uint32_t work_items_mask = test_params.work_items_mask; ng = ng / nw; std::string func_name; work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive" : func_name = "sub_group_scan_inclusive"; genrand(x, t, m, ns, nw, ng); log_info(" %s_%s(%s)...\n", func_name.c_str(), operation_names(operation), TypeManager::name()); log_info(" test params: global size = %d local size = %d subgroups " "size = %d work item mask = 0x%x \n", test_params.global_workgroup_size, nw, ns, work_items_mask); } static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, const WorkGroupParams &test_params) { int ii, i, j, k, n; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; uint32_t work_items_mask = test_params.work_items_mask; int nj = (nw + ns - 1) / ns; Ty tr, rr; ng = ng / nw; std::string func_name; work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive" : func_name = "sub_group_scan_inclusive"; uint32_t use_work_items_mask; // for uniform case take into consideration all workitems use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask; // std::bitset<32> mask32(use_work_items_mask); // for (int k) mask32.count(); for (k = 0; k < ng; ++k) { // for each work_group // Map to array indexed to array indexed by local ID and sub group for (j = 0; j < nw; ++j) { // inside the work_group mx[j] = x[j]; // read host inputs for work_group my[j] = y[j]; // read device outputs for work_group } for (j = 0; j < nj; ++j) { ii = j * ns; n = ii + ns > nw ? nw - ii : ns; std::set active_work_items; int catch_frist_active = -1; for (i = 0; i < n; ++i) { uint32_t check_work_item = 1 << (i % 32); if (use_work_items_mask & check_work_item) { if (catch_frist_active == -1) { catch_frist_active = i; } active_work_items.insert(i); } } if (active_work_items.empty()) { log_info(" No acitve workitems in workgroup id = %d " "subgroup id = %d - no calculation\n", k, j); continue; } else { tr = TypeManager::identify_limits(operation); for (const int &active_work_item : active_work_items) { rr = my[ii + active_work_item]; if (active_work_items.size() == 1) { tr = mx[ii + catch_frist_active]; } else { tr = calculate(tr, mx[ii + active_work_item], operation); } if (!compare_ordered(rr, tr)) { log_error( "ERROR: %s_%s(%s) " "mismatch for local id %d in sub group %d " "in " "group %d Expected: %d Obtained: %d\n", func_name.c_str(), operation_names(operation), TypeManager::name(), active_work_item, j, k, tr, rr); return TEST_FAIL; } } } } x += nw; y += nw; m += 4 * nw; } log_info(" %s_%s(%s)... passed\n", func_name.c_str(), operation_names(operation), TypeManager::name()); return TEST_PASS; } }; // Test for reduce non uniform functions template struct RED_NU { static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) { int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; uint32_t work_items_mask = test_params.work_items_mask; ng = ng / nw; std::string func_name; work_items_mask ? func_name = "sub_group_non_uniform_reduce" : func_name = "sub_group_reduce"; log_info(" %s_%s(%s)...\n", func_name.c_str(), operation_names(operation), TypeManager::name()); log_info(" test params: global size = %d local size = %d subgroups " "size = %d work item mask = 0x%x \n", test_params.global_workgroup_size, nw, ns, work_items_mask); genrand(x, t, m, ns, nw, ng); } static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, const WorkGroupParams &test_params) { int ii, i, j, k, n; int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; uint32_t work_items_mask = test_params.work_items_mask; int nj = (nw + ns - 1) / ns; ng = ng / nw; Ty tr, rr; std::string func_name; work_items_mask ? func_name = "sub_group_non_uniform_reduce" : func_name = "sub_group_reduce"; for (k = 0; k < ng; ++k) { // Map to array indexed to array indexed by local ID and sub // group for (j = 0; j < nw; ++j) { mx[j] = x[j]; my[j] = y[j]; } uint32_t use_work_items_mask; use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask; for (j = 0; j < nj; ++j) { ii = j * ns; n = ii + ns > nw ? nw - ii : ns; std::set active_work_items; int catch_frist_active = -1; for (i = 0; i < n; ++i) { uint32_t check_work_item = 1 << (i % 32); if (use_work_items_mask & check_work_item) { if (catch_frist_active == -1) { catch_frist_active = i; tr = mx[ii + i]; active_work_items.insert(i); continue; } active_work_items.insert(i); tr = calculate(tr, mx[ii + i], operation); } } if (active_work_items.empty()) { log_info(" No acitve workitems in workgroup id = %d " "subgroup id = %d - no calculation\n", k, j); continue; } for (const int &active_work_item : active_work_items) { rr = my[ii + active_work_item]; if (!compare_ordered(rr, tr)) { log_error("ERROR: %s_%s(%s) " "mismatch for local id %d in sub group %d in " "group %d Expected: %d Obtained: %d\n", func_name.c_str(), operation_names(operation), TypeManager::name(), active_work_item, j, k, tr, rr); return TEST_FAIL; } } } x += nw; y += nw; m += 4 * nw; } log_info(" %s_%s(%s)... passed\n", func_name.c_str(), operation_names(operation), TypeManager::name()); return TEST_PASS; } }; #endif