// // Copyright (c) 2017 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include "harness/compat.h" #include #include #include #include #include #include #include "procs.h" #include "harness/conversions.h" #include "harness/typeWrappers.h" #include "harness/errorHelpers.h" // Outputs debug information for stores #define DEBUG 0 // Forces stores/loads to be done with offsets = tid #define LINEAR_OFFSETS 0 #define NUM_LOADS 512 static const char *doubleExtensionPragma = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"; #pragma mark -------------------- vload harness -------------------------- typedef void (*create_vload_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize ); int test_vload( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize, create_vload_program_fn createFn, size_t bufferSize, MTdata d ) { int error; clProgramWrapper program; clKernelWrapper kernel; clMemWrapper streams[ 4 ]; const size_t numLoads = (DEBUG) ? 16 : NUM_LOADS; if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128; size_t threads[ 1 ], localThreads[ 1 ]; clProtectedArray inBuffer( bufferSize ); char programSrc[ 10240 ]; cl_uint offsets[ numLoads ], alignmentOffsets[ numLoads ]; size_t numElements, typeSize, i; unsigned int outVectorSize; typeSize = get_explicit_type_size( type ); numElements = bufferSize / ( typeSize * vecSize ); bufferSize = numElements * typeSize * vecSize; // To account for rounding if (DEBUG) log_info("Testing: numLoads: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numLoads, (int)typeSize, vecSize, (int)numElements, (int)bufferSize); // Create some random input data and random offsets to load from generate_random_data( type, numElements * vecSize, d, (void *)inBuffer ); for( i = 0; i < numLoads; i++ ) { offsets[ i ] = (cl_uint)random_in_range( 0, (int)numElements - 1, d ); if( offsets[ i ] < numElements - 2 ) alignmentOffsets[ i ] = (cl_uint)random_in_range( 0, (int)vecSize - 1, d ); else alignmentOffsets[ i ] = 0; if (LINEAR_OFFSETS) offsets[i] = (cl_uint)i; } if (LINEAR_OFFSETS) log_info("Offsets set to thread IDs to simplify output.\n"); // 32-bit fixup outVectorSize = vecSize; // Declare output buffers now #if !(defined(_WIN32) && defined(_MSC_VER)) char outBuffer[ numLoads * typeSize * outVectorSize ]; char referenceBuffer[ numLoads * typeSize * vecSize ]; #else char* outBuffer = (char*)_malloca(numLoads * typeSize * outVectorSize * sizeof(cl_char)); char* referenceBuffer = (char*)_malloca(numLoads * typeSize * vecSize * sizeof(cl_char)); #endif // Create the program createFn( programSrc, numElements, type, vecSize, outVectorSize); // Create our kernel const char *ptr = programSrc; error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" ); test_error( error, "Unable to create testing kernel" ); if (DEBUG) log_info("Kernel: \n%s\n", programSrc); // Get the number of args to differentiate the kernels with local storage. (They have 5) cl_uint numArgs; error = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(numArgs), &numArgs, NULL); test_error( error, "clGetKernelInfo failed"); // Set up parameters streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, bufferSize, (void *)inBuffer, &error ); test_error( error, "Unable to create kernel stream" ); streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(offsets[0]), offsets, &error ); test_error( error, "Unable to create kernel stream" ); streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*sizeof(alignmentOffsets[0]), alignmentOffsets, &error ); test_error( error, "Unable to create kernel stream" ); streams[ 3 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numLoads*typeSize*outVectorSize, (void *)outBuffer, &error ); test_error( error, "Unable to create kernel stream" ); // Set parameters and run if (numArgs == 5) { // We need to set the size of the local storage error = clSetKernelArg(kernel, 0, bufferSize, NULL); test_error( error, "clSetKernelArg for buffer failed"); for( i = 0; i < 4; i++ ) { error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] ); test_error( error, "Unable to set kernel argument" ); } } else { // No local storage for( i = 0; i < 4; i++ ) { error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] ); test_error( error, "Unable to set kernel argument" ); } } threads[ 0 ] = numLoads; error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] ); test_error( error, "Unable to get local thread size" ); error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL ); test_error( error, "Unable to exec kernel" ); // Get the results error = clEnqueueReadBuffer( queue, streams[ 3 ], CL_TRUE, 0, numLoads * typeSize * outVectorSize * sizeof(cl_char), (void *)outBuffer, 0, NULL, NULL ); test_error( error, "Unable to read results" ); // Create the reference results memset( referenceBuffer, 0, numLoads * typeSize * vecSize * sizeof(cl_char)); for( i = 0; i < numLoads; i++ ) { memcpy( referenceBuffer + i * typeSize * vecSize, ( (char *)(void *)inBuffer ) + ( ( offsets[ i ] * vecSize ) + alignmentOffsets[ i ] ) * typeSize, typeSize * vecSize ); } // Validate the results now char *expected = referenceBuffer; char *actual = outBuffer; char *in = (char *)(void *)inBuffer; if (DEBUG) { log_info("Memory contents:\n"); for (i=0; i 10240 ) localSize = 10240; if (localSize > 4096) localSize -= 2048; else localSize /= 2; return test_vloadset( device, context, queue, create_local_load_code, (size_t)localSize ); } void create_constant_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize ) { const char *pattern = "%s%s" "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n" "{\n" " int tid = get_global_id( 0 );\n" " %s%d tmp = vload%d( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n" " results[ tid ] = tmp;\n" "}\n"; const char *patternV3 = "%s%s" "__kernel void test_fn( __constant %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n" "{\n" " int tid = get_global_id( 0 );\n" " %s3 tmp = vload3( offsets[ tid ], ( (__constant %s *) src ) + alignmentOffsets[ tid ] );\n" " results[ 3*tid ] = tmp.s0;\n" " results[ 3*tid+1 ] = tmp.s1;\n" " results[ 3*tid+2 ] = tmp.s2;\n" "}\n"; const char *typeName = get_explicit_type_name(type); if(inVectorSize == 3) { sprintf( destBuffer, patternV3, type == kDouble ? doubleExtensionPragma : "", "", typeName, typeName, typeName, typeName ); } else { sprintf( destBuffer, pattern, type == kDouble ? doubleExtensionPragma : "", "", typeName, typeName, (int)outVectorSize, typeName, (int)inVectorSize, (int)inVectorSize, typeName ); } } int test_vload_constant(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems ) { // Determine the max size of a local buffer that we can test against cl_ulong maxSize; int error = clGetDeviceInfo( device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof( maxSize ), &maxSize, NULL ); test_error( error, "Unable to get max size of constant memory buffer" ); if( maxSize > 10240 ) maxSize = 10240; if (maxSize > 4096) maxSize -= 2048; else maxSize /= 2; return test_vloadset( device, context, queue, create_constant_load_code, (size_t)maxSize ); } void create_private_load_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize, size_t outVectorSize ) { const char *pattern = "%s%s" // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test "#define PRIV_TYPE %s%d\n" "#define PRIV_SIZE %d\n" "__kernel void test_fn( __global %s%d *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s%d *results )\n" "{\n" " __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n" " int tid = get_global_id( 0 );\n" "\n" " for( int i = 0; i < %d; i++ )\n" " sPrivateStorage[ i ] = src[ i ];\n" // Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for // anybody else to sync up "\n" " %s%d tmp = vload%d( offsets[ tid ], ( (__private %s *) sPrivateStorage ) + alignmentOffsets[ tid ] );\n" " results[ tid ] = tmp;\n" "}\n"; const char *patternV3 = "%s%s" // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test "#define PRIV_TYPE %s\n" "#define PRIV_SIZE %d\n" "__kernel void test_fn( __global %s *src, __global uint *offsets, __global uint *alignmentOffsets, __global %s *results )\n" "{\n" " __private PRIV_TYPE sPrivateStorage[ PRIV_SIZE ];\n" " int tid = get_global_id( 0 );\n" "\n" " for( int i = 0; i < PRIV_SIZE; i++ )\n" " {\n" " sPrivateStorage[ i ] = src[ i ];\n" " }\n" // Note: unlike the local test, each thread runs the above copy loop independently, so nobody needs to wait for // anybody else to sync up "\n" " %s3 tmp = vload3( offsets[ tid ], ( sPrivateStorage ) + alignmentOffsets[ tid ] );\n" " results[ 3*tid ] = tmp.s0;\n" " results[ 3*tid+1 ] = tmp.s1;\n" " results[ 3*tid+2 ] = tmp.s2;\n" "}\n"; const char *typeName = get_explicit_type_name(type); if(inVectorSize ==3) { sprintf( destBuffer, patternV3, type == kDouble ? doubleExtensionPragma : "", "", typeName, 3*((int)inBufferSize), typeName, typeName, typeName ); // log_info("Src is \"\n%s\n\"\n", destBuffer); } else { sprintf( destBuffer, pattern, type == kDouble ? doubleExtensionPragma : "", "", typeName, (int)inVectorSize, (int)inBufferSize, typeName, (int)inVectorSize, typeName, (int)outVectorSize, (int)inBufferSize, typeName, (int)inVectorSize, (int)inVectorSize, typeName ); } } int test_vload_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems ) { // We have no idea how much actual private storage is available, so just pick a reasonable value, // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes return test_vloadset( device, context, queue, create_private_load_code, 256 ); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// #pragma mark -------------------- vstore harness -------------------------- typedef void (*create_vstore_program_fn)( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize ); int test_vstore( cl_device_id device, cl_context context, cl_command_queue queue, ExplicitType type, unsigned int vecSize, create_vstore_program_fn createFn, size_t bufferSize, MTdata d ) { int error; clProgramWrapper program; clKernelWrapper kernel; clMemWrapper streams[ 3 ]; size_t threads[ 1 ], localThreads[ 1 ]; size_t numElements, typeSize, numStores = (DEBUG) ? 16 : NUM_LOADS; if (DEBUG) bufferSize = (bufferSize < 128) ? bufferSize : 128; typeSize = get_explicit_type_size( type ); numElements = bufferSize / ( typeSize * vecSize ); bufferSize = numElements * typeSize * vecSize; // To account for rounding if( numStores > numElements * 2 / 3 ) { // Note: unlike load, we have to restrict the # of stores here, since all offsets must be unique for our test // (Plus, we leave some room for extra values to make sure didn't get written) numStores = numElements * 2 / 3; if( numStores < 1 ) numStores = 1; } if (DEBUG) log_info("Testing: numStores: %d, typeSize: %d, vecSize: %d, numElements: %d, bufferSize: %d\n", (int)numStores, (int)typeSize, vecSize, (int)numElements, (int)bufferSize); #if !(defined(_WIN32) && defined(_MSC_VER)) cl_uint offsets[ numStores ]; #else cl_uint* offsets = (cl_uint*)_malloca(numStores * sizeof(cl_uint)); #endif char programSrc[ 10240 ]; size_t i; #if !(defined(_WIN32) && defined(_MSC_VER)) char inBuffer[ numStores * typeSize * vecSize ]; #else char* inBuffer = (char*)_malloca( numStores * typeSize * vecSize * sizeof(cl_char)); #endif clProtectedArray outBuffer( numElements * typeSize * vecSize ); #if !(defined(_WIN32) && defined(_MSC_VER)) char referenceBuffer[ numElements * typeSize * vecSize ]; #else char* referenceBuffer = (char*)_malloca(numElements * typeSize * vecSize * sizeof(cl_char)); #endif // Create some random input data and random offsets to load from generate_random_data( type, numStores * vecSize, d, (void *)inBuffer ); // Note: make sure no two offsets are the same, otherwise the output would depend on // the order that threads ran in, and that would be next to impossible to verify #if !(defined(_WIN32) && defined(_MSC_VER)) char flags[ numElements ]; #else char* flags = (char*)_malloca( numElements * sizeof(char)); #endif memset( flags, 0, numElements * sizeof(char) ); for( i = 0; i < numStores; i++ ) { do { offsets[ i ] = (cl_uint)random_in_range( 0, (int)numElements - 2, d ); // Note: keep it one vec below the end for offset testing } while( flags[ offsets[ i ] ] != 0 ); flags[ offsets[ i ] ] = -1; if (LINEAR_OFFSETS) offsets[i] = (int)i; } if (LINEAR_OFFSETS) log_info("Offsets set to thread IDs to simplify output.\n"); createFn( programSrc, numElements, type, vecSize ); // Create our kernel const char *ptr = programSrc; error = create_single_kernel_helper( context, &program, &kernel, 1, &ptr, "test_fn" ); test_error( error, "Unable to create testing kernel" ); if (DEBUG) log_info("Kernel: \n%s\n", programSrc); // Get the number of args to differentiate the kernels with local storage. (They have 5) cl_uint numArgs; error = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(numArgs), &numArgs, NULL); test_error( error, "clGetKernelInfo failed"); // Set up parameters streams[ 0 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * typeSize * vecSize * sizeof(cl_char), (void *)inBuffer, &error ); test_error( error, "Unable to create kernel stream" ); streams[ 1 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numStores * sizeof(cl_uint), offsets, &error ); test_error( error, "Unable to create kernel stream" ); streams[ 2 ] = clCreateBuffer( context, CL_MEM_COPY_HOST_PTR, numElements * typeSize * vecSize, (void *)outBuffer, &error ); test_error( error, "Unable to create kernel stream" ); // Set parameters and run if (numArgs == 5) { // We need to set the size of the local storage error = clSetKernelArg(kernel, 0, bufferSize, NULL); test_error( error, "clSetKernelArg for buffer failed"); for( i = 0; i < 3; i++ ) { error = clSetKernelArg( kernel, (int)i+1, sizeof( streams[ i ] ), &streams[ i ] ); test_error( error, "Unable to set kernel argument" ); } } else { // No local storage for( i = 0; i < 3; i++ ) { error = clSetKernelArg( kernel, (int)i, sizeof( streams[ i ] ), &streams[ i ] ); if (error) log_info("%s\n", programSrc); test_error( error, "Unable to set kernel argument" ); } } threads[ 0 ] = numStores; error = get_max_common_work_group_size( context, kernel, threads[ 0 ], &localThreads[ 0 ] ); test_error( error, "Unable to get local thread size" ); // Run in a loop, changing the address offset from 0 to ( vecSize - 1 ) each time, since // otherwise stores might overlap each other, and it'd be a nightmare to test! for( cl_uint addressOffset = 0; addressOffset < vecSize; addressOffset++ ) { if (DEBUG) log_info("\tstore addressOffset is %d, executing with threads %d\n", addressOffset, (int)threads[0]); // Clear the results first memset( outBuffer, 0, numElements * typeSize * vecSize ); error = clEnqueueWriteBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL ); test_error( error, "Unable to erase result stream" ); // Set up the new offset and run if (numArgs == 5) error = clSetKernelArg( kernel, 3+1, sizeof( cl_uint ), &addressOffset ); else error = clSetKernelArg( kernel, 3, sizeof( cl_uint ), &addressOffset ); test_error( error, "Unable to set address offset argument" ); error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL ); test_error( error, "Unable to exec kernel" ); // Get the results error = clEnqueueReadBuffer( queue, streams[ 2 ], CL_TRUE, 0, numElements * typeSize * vecSize, (void *)outBuffer, 0, NULL, NULL ); test_error( error, "Unable to read results" ); // Create the reference results memset( referenceBuffer, 0, numElements * typeSize * vecSize * sizeof(cl_char) ); for( i = 0; i < numStores; i++ ) { memcpy( referenceBuffer + ( ( offsets[ i ] * vecSize ) + addressOffset ) * typeSize, inBuffer + i * typeSize * vecSize, typeSize * vecSize ); } // Validate the results now char *expected = referenceBuffer; char *actual = (char *)(void *)outBuffer; if (DEBUG) { log_info("Memory contents:\n"); for (i=0; i>2) ], offsets[ tid ], destBuffer + alignmentOffset );\n" " } else {\n" " vstore3( vload3(tid, (__global %s *)srcValues), offsets[ tid ], destBuffer + alignmentOffset );\n" " }\n" "}\n"; const char *typeName = get_explicit_type_name(type); if(inVectorSize == 3) { sprintf( destBuffer, patternV3, type == kDouble ? doubleExtensionPragma : "", typeName, typeName, typeName); } else { sprintf( destBuffer, pattern, type == kDouble ? doubleExtensionPragma : "", typeName, (int)inVectorSize, typeName, (int)inVectorSize ); } // if(inVectorSize == 3 || inVectorSize == 4) { // log_info("\n----\n%s\n----\n", destBuffer); // } } int test_vstore_global(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems ) { return test_vstoreset( device, context, queue, create_global_store_code, 10240 ); } void create_local_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize ) { const char *pattern = "%s" "\n" "__kernel void test_fn(__local %s%d *sSharedStorage, __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n" "{\n" " int tid = get_global_id( 0 );\n" // We need to zero the shared storage since any locations we don't write to will have garbage otherwise. " sSharedStorage[ offsets[tid] ] = (%s%d)(%s)0;\n" " sSharedStorage[ offsets[tid] +1 ] = sSharedStorage[ offsets[tid] ];\n" " barrier( CLK_LOCAL_MEM_FENCE );\n" "\n" " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__local %s *)sSharedStorage ) + alignmentOffset );\n" "\n" // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy " barrier( CLK_LOCAL_MEM_FENCE );\n" "\n" // Note: we only copy the relevant portion of our local storage over to the dest buffer, because // otherwise, local threads would be overwriting results from other local threads " int i;\n" " __local %s *sp = (__local %s*) (sSharedStorage + offsets[tid]) + alignmentOffset;\n" " __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n" " for( i = 0; (size_t)i < sizeof( sSharedStorage[0]) / sizeof( *sp ); i++ ) \n" " dp[i] = sp[i];\n" "}\n"; const char *patternV3 = "%s" "\n" "__kernel void test_fn(__local %s *sSharedStorage, __global %s *srcValues, __global uint *offsets, __global %s *destBuffer, uint alignmentOffset )\n" "{\n" " int tid = get_global_id( 0 );\n" // We need to zero the shared storage since any locations we don't write to will have garbage otherwise. " sSharedStorage[ 3*offsets[tid] ] = (%s)0;\n" " sSharedStorage[ 3*offsets[tid] +1 ] = \n" " sSharedStorage[ 3*offsets[tid] ];\n" " sSharedStorage[ 3*offsets[tid] +2 ] = \n" " sSharedStorage[ 3*offsets[tid]];\n" " sSharedStorage[ 3*offsets[tid] +3 ] = \n" " sSharedStorage[ 3*offsets[tid]];\n" " sSharedStorage[ 3*offsets[tid] +4 ] = \n" " sSharedStorage[ 3*offsets[tid] ];\n" " sSharedStorage[ 3*offsets[tid] +5 ] = \n" " sSharedStorage[ 3*offsets[tid]];\n" " barrier( CLK_LOCAL_MEM_FENCE );\n" "\n" " vstore3( vload3(tid,srcValues), offsets[ tid ], sSharedStorage + alignmentOffset );\n" "\n" // Note: Once all threads are done vstore'ing into our shared storage, we then copy into the global output // buffer, but we have to make sure ALL threads are done vstore'ing before we do the copy " barrier( CLK_LOCAL_MEM_FENCE );\n" "\n" // Note: we only copy the relevant portion of our local storage over to the dest buffer, because // otherwise, local threads would be overwriting results from other local threads " int i;\n" " __local %s *sp = (sSharedStorage + 3*offsets[tid]) + alignmentOffset;\n" " __global %s *dp = (destBuffer + 3*offsets[tid]) + alignmentOffset;\n" " for( i = 0; i < 3; i++ ) \n" " dp[i] = sp[i];\n" "}\n"; const char *typeName = get_explicit_type_name(type); if(inVectorSize == 3) { sprintf( destBuffer, patternV3, type == kDouble ? doubleExtensionPragma : "", typeName, typeName, typeName, typeName, typeName, typeName, typeName ); } else { sprintf( destBuffer, pattern, type == kDouble ? doubleExtensionPragma : "", typeName, (int)inVectorSize, typeName, (int)inVectorSize, typeName, (int)inVectorSize, typeName, (int)inVectorSize, typeName, (int)inVectorSize, typeName, typeName, typeName, typeName, typeName ); } // log_info(destBuffer); } int test_vstore_local(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems ) { // Determine the max size of a local buffer that we can test against cl_ulong localSize; int error = clGetDeviceInfo( device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof( localSize ), &localSize, NULL ); test_error( error, "Unable to get max size of local memory buffer" ); if( localSize > 10240 ) localSize = 10240; if (localSize > 4096) localSize -= 2048; else localSize /= 2; return test_vstoreset( device, context, queue, create_local_store_code, (size_t)localSize ); } void create_private_store_code( char *destBuffer, size_t inBufferSize, ExplicitType type, size_t inVectorSize ) { const char *pattern = "%s" // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test "\n" "__kernel void test_fn( __global %s%d *srcValues, __global uint *offsets, __global %s%d *destBuffer, uint alignmentOffset )\n" "{\n" " __private %s%d sPrivateStorage[ %d ];\n" " int tid = get_global_id( 0 );\n" // We need to zero the shared storage since any locations we don't write to will have garbage otherwise. " sPrivateStorage[tid] = (%s%d)(%s)0;\n" "\n" " vstore%d( srcValues[ tid ], offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n" "\n" // Note: we only copy the relevant portion of our local storage over to the dest buffer, because // otherwise, local threads would be overwriting results from other local threads " uint i;\n" " __private %s *sp = (__private %s*) (sPrivateStorage + offsets[tid]) + alignmentOffset;\n" " __global %s *dp = (__global %s*) (destBuffer + offsets[tid]) + alignmentOffset;\n" " for( i = 0; i < sizeof( sPrivateStorage[0]) / sizeof( *sp ); i++ ) \n" " dp[i] = sp[i];\n" "}\n"; const char *patternV3 = "%s" // Private memory is unique per thread, unlike local storage which is unique per local work group. Which means // for this test, we have to copy the entire test buffer into private storage ON EACH THREAD to be an effective test "\n" "__kernel void test_fn( __global %s *srcValues, __global uint *offsets, __global %s3 *destBuffer, uint alignmentOffset )\n" "{\n" " __private %s3 sPrivateStorage[ %d ];\n" // keep this %d " int tid = get_global_id( 0 );\n" // We need to zero the shared storage since any locations we don't write to will have garbage otherwise. " sPrivateStorage[tid] = (%s3)(%s)0;\n" "\n" " vstore3( vload3(tid,srcValues), offsets[ tid ], ( (__private %s *)sPrivateStorage ) + alignmentOffset );\n" "\n" // Note: we only copy the relevant portion of our local storage over to the dest buffer, because // otherwise, local threads would be overwriting results from other local threads " uint i;\n" " __private %s *sp = ((__private %s*) sPrivateStorage) + 3*offsets[tid] + alignmentOffset;\n" " __global %s *dp = ((__global %s*) destBuffer) + 3*offsets[tid] + alignmentOffset;\n" " for( i = 0; i < 3; i++ ) \n" " dp[i] = sp[i];\n" "}\n"; const char *typeName = get_explicit_type_name(type); if(inVectorSize == 3) { sprintf( destBuffer, patternV3, type == kDouble ? doubleExtensionPragma : "", typeName, typeName, typeName, (int)inBufferSize, typeName, typeName, typeName, typeName, typeName, typeName, typeName ); } else { sprintf( destBuffer, pattern, type == kDouble ? doubleExtensionPragma : "", typeName, (int)inVectorSize, typeName, (int)inVectorSize, typeName, (int)inVectorSize, (int)inBufferSize, typeName, (int)inVectorSize, typeName, (int)inVectorSize, typeName, typeName, typeName, typeName, typeName ); } } int test_vstore_private(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems ) { // We have no idea how much actual private storage is available, so just pick a reasonable value, // which is that we can fit at least two 16-element long, which is 2*8 bytes * 16 = 256 bytes return test_vstoreset( device, context, queue, create_private_store_code, 256 ); }