// // Copyright (c) 2017 The Khronos Group Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include "testBase.h" #include "harness/typeWrappers.h" #include "harness/conversions.h" #include "harness/errorHelpers.h" const char *crossKernelSource_double = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" "__kernel void sample_test(__global double4 *sourceA, __global double4 *sourceB, __global double4 *destValues)\n" "{\n" " int tid = get_global_id(0);\n" " destValues[tid] = cross( sourceA[tid], sourceB[tid] );\n" "\n" "}\n"; const char *crossKernelSource_doubleV3 = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" "__kernel void sample_test(__global double *sourceA, __global double *sourceB, __global double *destValues)\n" "{\n" " int tid = get_global_id(0);\n" " vstore3( cross( vload3( tid, sourceA), vload3( tid, sourceB) ), tid, destValues);\n" "\n" "}\n"; const char *twoToFloatKernelPattern_double = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" "__kernel void sample_test(__global double%s *sourceA, __global double%s *sourceB, __global double *destValues)\n" "{\n" " int tid = get_global_id(0);\n" " destValues[tid] = %s( sourceA[tid], sourceB[tid] );\n" "\n" "}\n"; const char *twoToFloatKernelPattern_doubleV3 = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" "__kernel void sample_test(__global double%s *sourceA, __global double%s *sourceB, __global double *destValues)\n" "{\n" " int tid = get_global_id(0);\n" " destValues[tid] = %s( vload3( tid, (__global double*) sourceA), vload3( tid, (__global double*) sourceB ) );\n" "\n" "}\n"; const char *oneToFloatKernelPattern_double = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" "__kernel void sample_test(__global double%s *sourceA, __global double *destValues)\n" "{\n" " int tid = get_global_id(0);\n" " destValues[tid] = %s( sourceA[tid] );\n" "\n" "}\n"; const char *oneToFloatKernelPattern_doubleV3 = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" "__kernel void sample_test(__global double%s *sourceA, __global double *destValues)\n" "{\n" " int tid = get_global_id(0);\n" " destValues[tid] = %s( vload3( tid, (__global double*) sourceA) );\n" "\n" "}\n"; const char *oneToOneKernelPattern_double = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" "__kernel void sample_test(__global double%s *sourceA, __global double%s *destValues)\n" "{\n" " int tid = get_global_id(0);\n" " destValues[tid] = %s( sourceA[tid] );\n" "\n" "}\n"; const char *oneToOneKernelPattern_doubleV3 = "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" "__kernel void sample_test(__global double%s *sourceA, __global double%s *destValues)\n" "{\n" " int tid = get_global_id(0);\n" " vstore3( %s( vload3( tid, (__global double*) sourceA) ), tid, (__global double*) destValues );\n" "\n" "}\n"; #define TEST_SIZE (1 << 20) double verifyLength_double( double *srcA, size_t vecSize ); double verifyDistance_double( double *srcA, double *srcB, size_t vecSize ); void vector2string_double( char *string, double *vector, size_t elements ) { *string++ = '{'; *string++ = ' '; string += sprintf( string, "%a", vector[0] ); size_t i; for( i = 1; i < elements; i++ ) string += sprintf( string, ", %a", vector[i] ); *string++ = ' '; *string++ = '}'; *string = '\0'; } void fillWithTrickyNumbers_double( double *aVectors, double *bVectors, size_t vecSize ) { static const cl_double trickyValues[] = { -FLT_EPSILON, FLT_EPSILON, MAKE_HEX_DOUBLE(0x1.0p511, 0x1L, 511), MAKE_HEX_DOUBLE(0x1.8p511, 0x18L, 507), MAKE_HEX_DOUBLE(0x1.0p512, 0x1L, 512), MAKE_HEX_DOUBLE(-0x1.0p511, -0x1L, 511), MAKE_HEX_DOUBLE(-0x1.8p-511, -0x18L, -515), MAKE_HEX_DOUBLE(-0x1.0p512, -0x1L, 512), MAKE_HEX_DOUBLE(0x1.0p-511, 0x1L, -511), MAKE_HEX_DOUBLE(0x1.8p-511, 0x18L, -515), MAKE_HEX_DOUBLE(0x1.0p-512, 0x1L, -512), MAKE_HEX_DOUBLE(-0x1.0p-511, -0x1L, -511), MAKE_HEX_DOUBLE(-0x1.8p-511, -0x18L, -515), MAKE_HEX_DOUBLE(-0x1.0p-512, -0x1L, -512), DBL_MAX / 2., -DBL_MAX / 2., INFINITY, -INFINITY, 0., -0. }; static const size_t trickyCount = sizeof( trickyValues ) / sizeof( trickyValues[0] ); static const size_t stride[4] = {1, trickyCount, trickyCount*trickyCount, trickyCount*trickyCount*trickyCount }; size_t i, j, k; for( j = 0; j < vecSize; j++ ) for( k = 0; k < vecSize; k++ ) for( i = 0; i < trickyCount; i++ ) aVectors[ j + stride[j] * (i + k*trickyCount)*vecSize] = trickyValues[i]; if( bVectors ) { size_t copySize = vecSize * vecSize * trickyCount; memset( bVectors, 0, sizeof(double) * copySize ); memset( aVectors + copySize, 0, sizeof(double) * copySize ); memcpy( bVectors + copySize, aVectors, sizeof(double) * copySize ); } } void cross_product_double( const double *vecA, const double *vecB, double *outVector, double *errorTolerances, double ulpTolerance ) { outVector[ 0 ] = ( vecA[ 1 ] * vecB[ 2 ] ) - ( vecA[ 2 ] * vecB[ 1 ] ); outVector[ 1 ] = ( vecA[ 2 ] * vecB[ 0 ] ) - ( vecA[ 0 ] * vecB[ 2 ] ); outVector[ 2 ] = ( vecA[ 0 ] * vecB[ 1 ] ) - ( vecA[ 1 ] * vecB[ 0 ] ); outVector[ 3 ] = 0.0f; errorTolerances[ 0 ] = fmax( fabs( vecA[ 1 ] ), fmax( fabs( vecB[ 2 ] ), fmax( fabs( vecA[ 2 ] ), fabs( vecB[ 1 ] ) ) ) ); errorTolerances[ 1 ] = fmax( fabs( vecA[ 2 ] ), fmax( fabs( vecB[ 0 ] ), fmax( fabs( vecA[ 0 ] ), fabs( vecB[ 2 ] ) ) ) ); errorTolerances[ 2 ] = fmax( fabs( vecA[ 0 ] ), fmax( fabs( vecB[ 1 ] ), fmax( fabs( vecA[ 1 ] ), fabs( vecB[ 0 ] ) ) ) ); errorTolerances[ 0 ] = errorTolerances[ 0 ] * errorTolerances[ 0 ] * ( ulpTolerance * FLT_EPSILON ); // This gives us max squared times ulp tolerance, i.e. the worst-case expected variance we could expect from this result errorTolerances[ 1 ] = errorTolerances[ 1 ] * errorTolerances[ 1 ] * ( ulpTolerance * FLT_EPSILON ); errorTolerances[ 2 ] = errorTolerances[ 2 ] * errorTolerances[ 2 ] * ( ulpTolerance * FLT_EPSILON ); } int test_geom_cross_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d) { cl_int error; cl_ulong maxAllocSize, maxGlobalMemSize; error = clGetDeviceInfo( deviceID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof( maxAllocSize ), &maxAllocSize, NULL ); error |= clGetDeviceInfo( deviceID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof( maxGlobalMemSize ), &maxGlobalMemSize, NULL ); test_error( error, "Unable to get device config" ); log_info("Device supports:\nCL_DEVICE_MAX_MEM_ALLOC_SIZE: %gMB\nCL_DEVICE_GLOBAL_MEM_SIZE: %gMB\n", maxGlobalMemSize/(1024.0*1024.0), maxAllocSize/(1024.0*1024.0)); if (maxGlobalMemSize > (cl_ulong)SIZE_MAX) { maxGlobalMemSize = (cl_ulong)SIZE_MAX; } unsigned int size; unsigned int bufSize; unsigned int adjustment; int vecsize; adjustment = 32*1024*1024; /* Try to allocate a bit less than the limits */ for(vecsize = 3; vecsize <= 4; ++vecsize) { /* Make sure we adhere to the maximum individual allocation size and global memory size limits. */ size = TEST_SIZE; bufSize = sizeof(cl_double) * TEST_SIZE * vecsize; while ((bufSize > (maxAllocSize - adjustment)) || (3*bufSize > (maxGlobalMemSize - adjustment))) { size /= 2; bufSize = sizeof(cl_double) * size * vecsize; } /* Perform the test */ clProgramWrapper program; clKernelWrapper kernel; clMemWrapper streams[3]; cl_double testVector[4]; int error, i; size_t threads[1], localThreads[1]; BufferOwningPtr A(malloc(bufSize)); BufferOwningPtr B(malloc(bufSize)); BufferOwningPtr C(malloc(bufSize)); cl_double *inDataA = A; cl_double *inDataB = B; cl_double *outData = C; /* Create kernels */ if( create_single_kernel_helper( context, &program, &kernel, 1, vecsize == 3 ? &crossKernelSource_doubleV3 : &crossKernelSource_double, "sample_test" ) ) return -1; /* Generate some streams. Note: deliberately do some random data in w to verify that it gets ignored */ for( i = 0; i < size * vecsize; i++ ) { inDataA[ i ] = get_random_double( -512.f, 512.f, d ); inDataB[ i ] = get_random_double( -512.f, 512.f, d ); } fillWithTrickyNumbers_double( inDataA, inDataB, vecsize ); streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, bufSize, inDataA, NULL); if( streams[0] == NULL ) { log_error("ERROR: Creating input array A failed!\n"); return -1; } streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, bufSize, inDataB, NULL); if( streams[1] == NULL ) { log_error("ERROR: Creating input array B failed!\n"); return -1; } streams[2] = clCreateBuffer(context, CL_MEM_READ_WRITE, bufSize, NULL, NULL); if( streams[2] == NULL ) { log_error("ERROR: Creating output array failed!\n"); return -1; } /* Assign streams and execute */ for( i = 0; i < 3; i++ ) { error = clSetKernelArg(kernel, i, sizeof( streams[i] ), &streams[i]); test_error( error, "Unable to set indexed kernel arguments" ); } /* Run the kernel */ threads[0] = size; error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] ); test_error( error, "Unable to get work group size to use" ); error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL ); test_error( error, "Unable to execute test kernel" ); /* Now get the results */ error = clEnqueueReadBuffer( queue, streams[2], true, 0, bufSize, outData, 0, NULL, NULL ); test_error( error, "Unable to read output array!" ); /* And verify! */ for( i = 0; i < size; i++ ) { double errorTolerances[ 4 ]; // On an embedded device w/ round-to-zero, 3 ulps is the worst-case tolerance for cross product cross_product_double( inDataA + i * vecsize, inDataB + i * vecsize, testVector, errorTolerances, 3.f ); double errs[] = { fabs( testVector[ 0 ] - outData[ i * vecsize + 0 ] ), fabs( testVector[ 1 ] - outData[ i * vecsize + 1 ] ), fabs( testVector[ 2 ] - outData[ i * vecsize + 2 ] ) }; if( errs[ 0 ] > errorTolerances[ 0 ] || errs[ 1 ] > errorTolerances[ 1 ] || errs[ 2 ] > errorTolerances[ 2 ] ) { log_error( "ERROR: Data sample %d does not validate! Expected (%a,%a,%a,%a), got (%a,%a,%a,%a)\n", i, testVector[0], testVector[1], testVector[2], testVector[3], outData[i*vecsize], outData[i*vecsize+1], outData[i*vecsize+2], outData[i*vecsize+3] ); log_error( " Input: (%a %a %a) and (%a %a %a)\n", inDataA[ i * vecsize + 0 ], inDataA[ i * vecsize + 1 ], inDataA[ i * vecsize + 2 ], inDataB[ i * vecsize + 0 ], inDataB[ i * vecsize + 1 ], inDataB[ i * vecsize + 2 ] ); log_error( " Errors: (%a out of %a), (%a out of %a), (%a out of %a)\n", errs[ 0 ], errorTolerances[ 0 ], errs[ 1 ], errorTolerances[ 1 ], errs[ 2 ], errorTolerances[ 2 ] ); log_error(" ulp %g\n", Ulp_Error_Double( outData[ i * vecsize + 1 ], testVector[ 1 ] ) ); return -1; } } } return 0; } double getMaxValue_double( double vecA[], double vecB[], size_t vecSize ) { double a = fmax( fabs( vecA[ 0 ] ), fabs( vecB[ 0 ] ) ); for( size_t i = 1; i < vecSize; i++ ) a = fmax( fabs( vecA[ i ] ), fmax( fabs( vecB[ i ] ), a ) ); return a; } typedef double (*twoToFloatVerifyFn_double)( double *srcA, double *srcB, size_t vecSize ); int test_twoToFloat_kernel_double(cl_command_queue queue, cl_context context, const char *fnName, size_t vecSize, twoToFloatVerifyFn_double verifyFn, double ulpLimit, MTdata d ) { clProgramWrapper program; clKernelWrapper kernel; clMemWrapper streams[3]; int error; size_t i, threads[1], localThreads[1]; char kernelSource[10240]; char *programPtr; char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" }; BufferOwningPtr A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize)); BufferOwningPtr B(malloc(sizeof(cl_double) * TEST_SIZE * vecSize)); BufferOwningPtr C(malloc(sizeof(cl_double) * TEST_SIZE)); cl_double *inDataA = A; cl_double *inDataB = B; cl_double *outData = C; /* Create the source */ sprintf( kernelSource, vecSize == 3 ? twoToFloatKernelPattern_doubleV3 : twoToFloatKernelPattern_double, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName ); /* Create kernels */ programPtr = kernelSource; if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) ) return -1; /* Generate some streams */ for( i = 0; i < TEST_SIZE * vecSize; i++ ) { inDataA[ i ] = any_double(d); inDataB[ i ] = any_double(d); } fillWithTrickyNumbers_double( inDataA, inDataB, vecSize ); streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL); if( streams[0] == NULL ) { log_error("ERROR: Creating input array A failed!\n"); return -1; } streams[1] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_double) * vecSize * TEST_SIZE, inDataB, NULL); if( streams[1] == NULL ) { log_error("ERROR: Creating input array B failed!\n"); return -1; } streams[2] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_double) * TEST_SIZE, NULL, NULL); if( streams[2] == NULL ) { log_error("ERROR: Creating output array failed!\n"); return -1; } /* Assign streams and execute */ for( i = 0; i < 3; i++ ) { error = clSetKernelArg(kernel, (int)i, sizeof( streams[i] ), &streams[i]); test_error( error, "Unable to set indexed kernel arguments" ); } /* Run the kernel */ threads[0] = TEST_SIZE; error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] ); test_error( error, "Unable to get work group size to use" ); error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL ); test_error( error, "Unable to execute test kernel" ); /* Now get the results */ error = clEnqueueReadBuffer( queue, streams[2], true, 0, sizeof( cl_double ) * TEST_SIZE, outData, 0, NULL, NULL ); test_error( error, "Unable to read output array!" ); /* And verify! */ for( i = 0; i < TEST_SIZE; i++ ) { double expected = verifyFn( inDataA + i * vecSize, inDataB + i * vecSize, vecSize ); if( (double) expected != outData[ i ] ) { if( isnan(expected) && isnan( outData[i] ) ) continue; if( ulpLimit < 0 ) { // Limit below zero means we need to test via a computed error (like cross product does) double maxValue = getMaxValue_double( inDataA + i * vecSize, inDataB + i * vecSize, vecSize ); // In this case (dot is the only one that gets here), the ulp is 2*vecSize - 1 (n + n-1 max # of errors) double errorTolerance = maxValue * maxValue * ( 2.f * (double)vecSize - 1.f ) * FLT_EPSILON; // Limit below zero means test via epsilon instead double error = fabs( (double)expected - (double)outData[ i ] ); if( error > errorTolerance ) { log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) error of %g against tolerance %g\n", (int)i, (int)vecSize, expected, outData[ i ], inDataA[i*vecSize], inDataB[i*vecSize], (double)error, (double)errorTolerance ); char vecA[1000], vecB[1000]; vector2string_double( vecA, inDataA + i * vecSize, vecSize ); vector2string_double( vecB, inDataB + i * vecSize, vecSize ); log_error( "\tvector A: %s\n\tvector B: %s\n", vecA, vecB ); return -1; } } else { double error = Ulp_Error_Double( outData[ i ], expected ); if( fabs(error) > ulpLimit ) { log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), sources (%a and %a) ulp of %f\n", (int)i, (int)vecSize, expected, outData[ i ], inDataA[i*vecSize], inDataB[i*vecSize], error ); char vecA[1000], vecB[1000]; vector2string_double( vecA, inDataA + i * vecSize, vecSize ); vector2string_double( vecB, inDataB + i * vecSize, vecSize ); log_error( "\tvector A: %s\n\tvector B: %s\n", vecA, vecB ); return -1; } } } } return 0; } double verifyDot_double( double *srcA, double *srcB, size_t vecSize ) { double total = 0.f; for( unsigned int i = 0; i < vecSize; i++ ) total += (double)srcA[ i ] * (double)srcB[ i ]; return total; } int test_geom_dot_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d) { size_t sizes[] = { 1, 2, 3, 4, 0 }; unsigned int size; int retVal = 0; for( size = 0; sizes[ size ] != 0 ; size++ ) { if( test_twoToFloat_kernel_double( queue, context, "dot", sizes[ size ], verifyDot_double, -1.0f /*magic value*/, d ) != 0 ) { log_error( " dot double vector size %d FAILED\n", (int)sizes[ size ] ); retVal = -1; } } return retVal; } int test_geom_fast_distance_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d) { size_t sizes[] = { 1, 2, 3, 4, 0 }; unsigned int size; int retVal = 0; abort(); //there is no double precision fast_distance for( size = 0; sizes[ size ] != 0 ; size++ ) { double maxUlps = 8192.0f + // error in sqrt 0.5f * // effect on e of taking sqrt( x + e ) ( 1.5f * (double) sizes[size] + // cumulative error for multiplications (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication 0.5f * (double) (sizes[size]-1)); // cumulative error for additions if( test_twoToFloat_kernel_double( queue, context, "fast_distance", sizes[ size ], verifyDistance_double, maxUlps, d ) != 0 ) { log_error( " fast_distance double vector size %d FAILED\n", (int)sizes[ size ] ); retVal = -1; } else { log_info( " fast_distance double vector size %d passed\n", (int)sizes[ size ] ); } } return retVal; } double verifyDistance_double( double *srcA, double *srcB, size_t vecSize ) { unsigned int i; double diff[4]; for( i = 0; i < vecSize; i++ ) diff[i] = srcA[i] - srcB[i]; return verifyLength_double( diff, vecSize ); } int test_geom_distance_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d) { size_t sizes[] = { 1, 2, 3, 4, 0 }; unsigned int size; int retVal = 0; for( size = 0; sizes[ size ] != 0 ; size++ ) { double maxUlps = 3.0f + // error in sqrt 0.5f * // effect on e of taking sqrt( x + e ) ( 1.5f * (double) sizes[size] + // cumulative error for multiplications (a-b+0.5ulp)**2 = (a-b)**2 + a*0.5ulp + b*0.5 ulp + 0.5 ulp for multiplication 0.5f * (double) (sizes[size]-1)); // cumulative error for additions maxUlps *= 2.0; // our reference code may be in error too if( test_twoToFloat_kernel_double( queue, context, "distance", sizes[ size ], verifyDistance_double, maxUlps, d ) != 0 ) { log_error( " distance double vector size %d FAILED\n", (int)sizes[ size ] ); retVal = -1; } else { log_info( " distance double vector size %d passed\n", (int)sizes[ size ] ); } } return retVal; } typedef double (*oneToFloatVerifyFn_double)( double *srcA, size_t vecSize ); int test_oneToFloat_kernel_double(cl_command_queue queue, cl_context context, const char *fnName, size_t vecSize, oneToFloatVerifyFn_double verifyFn, double ulpLimit, MTdata d ) { clProgramWrapper program; clKernelWrapper kernel; clMemWrapper streams[2]; BufferOwningPtr A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize)); BufferOwningPtr B(malloc(sizeof(cl_double) * TEST_SIZE)); int error; size_t i, threads[1], localThreads[1]; char kernelSource[10240]; char *programPtr; char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" }; cl_double *inDataA = A; cl_double *outData = B; /* Create the source */ sprintf( kernelSource, vecSize == 3 ? oneToFloatKernelPattern_doubleV3 : oneToFloatKernelPattern_double, sizeNames[vecSize-1], fnName ); /* Create kernels */ programPtr = kernelSource; if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) ) return -1; /* Generate some streams */ for( i = 0; i < TEST_SIZE * vecSize; i++ ) inDataA[ i ] = any_double(d); fillWithTrickyNumbers_double( inDataA, NULL, vecSize ); streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL); if( streams[0] == NULL ) { log_error("ERROR: Creating input array A failed!\n"); return -1; } streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_double) * TEST_SIZE, NULL, NULL); if( streams[1] == NULL ) { log_error("ERROR: Creating output array failed!\n"); return -1; } /* Assign streams and execute */ error = clSetKernelArg( kernel, 0, sizeof( streams[ 0 ] ), &streams[0] ); test_error( error, "Unable to set indexed kernel arguments" ); error = clSetKernelArg( kernel, 1, sizeof( streams[ 1 ] ), &streams[1] ); test_error( error, "Unable to set indexed kernel arguments" ); /* Run the kernel */ threads[0] = TEST_SIZE; error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] ); test_error( error, "Unable to get work group size to use" ); error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL ); test_error( error, "Unable to execute test kernel" ); /* Now get the results */ error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_double ) * TEST_SIZE, outData, 0, NULL, NULL ); test_error( error, "Unable to read output array!" ); /* And verify! */ for( i = 0; i < TEST_SIZE; i++ ) { double expected = verifyFn( inDataA + i * vecSize, vecSize ); if( (double) expected != outData[ i ] ) { double ulps = Ulp_Error_Double( outData[i], expected ); if( fabs( ulps ) <= ulpLimit ) continue; // We have to special case NAN if( isnan( outData[ i ] ) && isnan( expected ) ) continue; if(! (fabs(ulps) < ulpLimit) ) { log_error( "ERROR: Data sample %d at size %d does not validate! Expected (%a), got (%a), source (%a), ulp %f\n", (int)i, (int)vecSize, expected, outData[ i ], inDataA[i*vecSize], ulps ); char vecA[1000]; vector2string_double( vecA, inDataA + i * vecSize, vecSize ); log_error( "\tvector: %s", vecA ); return -1; } } } return 0; } double verifyLength_double( double *srcA, size_t vecSize ) { double total = 0; unsigned int i; // We calculate the distance as a double, to try and make up for the fact that // the GPU has better precision distance since it's a single op for( i = 0; i < vecSize; i++ ) total += srcA[i] * srcA[i]; // Deal with spurious overflow if( total == INFINITY ) { total = 0.0; for( i = 0; i < vecSize; i++ ) { double f = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p-600, 0x1LL, -600); total += f * f; } return sqrt( total ) * MAKE_HEX_DOUBLE(0x1.0p600, 0x1LL, 600); } // Deal with spurious underflow if( total < 4 /*max vector length*/ * DBL_MIN / DBL_EPSILON ) { total = 0.0; for( i = 0; i < vecSize; i++ ) { double f = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p700, 0x1LL, 700); total += f * f; } return sqrt( total ) * MAKE_HEX_DOUBLE(0x1.0p-700, 0x1LL, -700); } return sqrt( total ); } int test_geom_length_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d) { size_t sizes[] = { 1, 2, 3, 4, 0 }; unsigned int size; int retVal = 0; for( size = 0; sizes[ size ] != 0 ; size++ ) { double maxUlps = 3.0f + // error in sqrt 0.5f * // effect on e of taking sqrt( x + e ) ( 0.5f * (double) sizes[size] + // cumulative error for multiplications 0.5f * (double) (sizes[size]-1)); // cumulative error for additions maxUlps *= 2.0; // our reference code may be in error too if( test_oneToFloat_kernel_double( queue, context, "length", sizes[ size ], verifyLength_double, maxUlps, d ) != 0 ) { log_error( " length double vector size %d FAILED\n", (int)sizes[ size ] ); retVal = -1; } else { log_info( " length double vector size %d passed\n", (int)sizes[ size ] ); } } return retVal; } double verifyFastLength_double( double *srcA, size_t vecSize ) { double total = 0; unsigned int i; // We calculate the distance as a double, to try and make up for the fact that // the GPU has better precision distance since it's a single op for( i = 0; i < vecSize; i++ ) { total += (double)srcA[i] * (double)srcA[i]; } return sqrt( total ); } int test_geom_fast_length_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d) { size_t sizes[] = { 1, 2, 3, 4, 0 }; unsigned int size; int retVal = 0; abort(); //there is no double precision fast_length for( size = 0; sizes[ size ] != 0 ; size++ ) { double maxUlps = 8192.0f + // error in half_sqrt 0.5f * // effect on e of taking sqrt( x + e ) ( 0.5f * (double) sizes[size] + // cumulative error for multiplications 0.5f * (double) (sizes[size]-1)); // cumulative error for additions if( test_oneToFloat_kernel_double( queue, context, "fast_length", sizes[ size ], verifyFastLength_double, maxUlps, d ) != 0 ) { log_error( " fast_length double vector size %d FAILED\n", (int)sizes[ size ] ); retVal = -1; } else { log_info( " fast_length double vector size %d passed\n", (int)sizes[ size ] ); } } return retVal; } typedef void (*oneToOneVerifyFn_double)( double *srcA, double *dstA, size_t vecSize ); int test_oneToOne_kernel_double(cl_command_queue queue, cl_context context, const char *fnName, size_t vecSize, oneToOneVerifyFn_double verifyFn, double ulpLimit, MTdata d ) { clProgramWrapper program; clKernelWrapper kernel; clMemWrapper streams[2]; BufferOwningPtr A(malloc(sizeof(cl_double) * TEST_SIZE * vecSize)); BufferOwningPtr B(malloc(sizeof(cl_double) * TEST_SIZE * vecSize)); int error; size_t i, j, threads[1], localThreads[1]; char kernelSource[10240]; char *programPtr; char sizeNames[][4] = { "", "2", "3", "4", "", "", "", "8", "", "", "", "", "", "", "", "16" }; cl_double *inDataA = A; cl_double *outData = B; /* Create the source */ sprintf( kernelSource, vecSize == 3 ? oneToOneKernelPattern_doubleV3 : oneToOneKernelPattern_double, sizeNames[vecSize-1], sizeNames[vecSize-1], fnName ); /* Create kernels */ programPtr = kernelSource; if( create_single_kernel_helper( context, &program, &kernel, 1, (const char **)&programPtr, "sample_test" ) ) return -1; /* initialize data */ memset( inDataA, 0, vecSize * sizeof( cl_double ) ); for( i = vecSize; i < TEST_SIZE * vecSize; i++ ) inDataA[ i ] = any_double(d); streams[0] = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_double) * vecSize * TEST_SIZE, inDataA, NULL); if( streams[0] == NULL ) { log_error("ERROR: Creating input array A failed!\n"); return -1; } streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_double) * vecSize * TEST_SIZE, NULL, NULL); if( streams[1] == NULL ) { log_error("ERROR: Creating output array failed!\n"); return -1; } /* Assign streams and execute */ error = clSetKernelArg(kernel, 0, sizeof( streams[0] ), &streams[0] ); test_error( error, "Unable to set indexed kernel arguments" ); error = clSetKernelArg(kernel, 1, sizeof( streams[1] ), &streams[1] ); test_error( error, "Unable to set indexed kernel arguments" ); /* Run the kernel */ threads[0] = TEST_SIZE; error = get_max_common_work_group_size( context, kernel, threads[0], &localThreads[0] ); test_error( error, "Unable to get work group size to use" ); error = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, threads, localThreads, 0, NULL, NULL ); test_error( error, "Unable to execute test kernel" ); /* Now get the results */ error = clEnqueueReadBuffer( queue, streams[1], true, 0, sizeof( cl_double ) * TEST_SIZE * vecSize, outData, 0, NULL, NULL ); test_error( error, "Unable to read output array!" ); /* And verify! */ for( i = 0; i < TEST_SIZE; i++ ) { double expected[4]; verifyFn( inDataA + i * vecSize, expected, vecSize ); for( j = 0; j < vecSize; j++ ) { // We have to special case NAN if( isnan( outData[ i * vecSize + j ] ) && isnan( expected[ j ] ) ) continue; if( expected[j] != outData[ i *vecSize+j ] ) { double error = Ulp_Error_Double( outData[i*vecSize + j ], expected[ j ] ); if( fabs(error) > ulpLimit ) { log_error( "ERROR: Data sample {%d,%d} at size %d does not validate! Expected %12.24f (%a), got %12.24f (%a), ulp %f\n", (int)i, (int)j, (int)vecSize, expected[j], expected[j], outData[i*vecSize +j], outData[i*vecSize +j], error ); log_error( " Source: " ); for( size_t q = 0; q < vecSize; q++ ) log_error( "%g ", inDataA[ i * vecSize + q ] ); log_error( "\n : " ); for( size_t q = 0; q < vecSize; q++ ) log_error( "%a ", inDataA[ i * vecSize + q ] ); log_error( "\n" ); log_error( " Result: " ); for( size_t q = 0; q < vecSize; q++ ) log_error( "%g ", outData[i * vecSize + q ] ); log_error( "\n : " ); for( size_t q = 0; q < vecSize; q++ ) log_error( "%a ", outData[i * vecSize + q ] ); log_error( "\n" ); log_error( " Expected: " ); for( size_t q = 0; q < vecSize; q++ ) log_error( "%g ", expected[ q ] ); log_error( "\n : " ); for( size_t q = 0; q < vecSize; q++ ) log_error( "%a ", expected[ q ] ); log_error( "\n" ); return -1; } } } } return 0; } void verifyNormalize_double( double *srcA, double *dst, size_t vecSize ) { double total = 0, value; unsigned int i; // We calculate everything as a double, to try and make up for the fact that // the GPU has better precision distance since it's a single op for( i = 0; i < vecSize; i++ ) total += (double)srcA[i] * (double)srcA[i]; if( total < vecSize * DBL_MIN / DBL_EPSILON ) { //we may have incurred denormalization loss -- rescale total = 0; for( i = 0; i < vecSize; i++ ) { dst[i] = srcA[i] * MAKE_HEX_DOUBLE(0x1.0p700, 0x1LL, 700); //exact total += dst[i] * dst[i]; } //If still zero if( total == 0.0 ) { // Special edge case: copy vector over without change for( i = 0; i < vecSize; i++ ) dst[i] = srcA[i]; return; } srcA = dst; } else if( total == INFINITY ) { //we may have incurred spurious overflow double scale = MAKE_HEX_DOUBLE(0x1.0p-512, 0x1LL, -512) / vecSize; total = 0; for( i = 0; i < vecSize; i++ ) { dst[i] = srcA[i] * scale; //exact total += dst[i] * dst[i]; } // If there are infinities here, handle those if( total == INFINITY ) { total = 0; for( i = 0; i < vecSize; i++ ) { if( isinf(dst[i]) ) { dst[i] = copysign( 1.0, srcA[i] ); total += 1.0; } else dst[i] = copysign( 0.0, srcA[i] ); } } srcA = dst; } value = sqrt( total ); for( i = 0; i < vecSize; i++ ) dst[i] = srcA[i] / value; } int test_geom_normalize_double(cl_device_id deviceID, cl_context context, cl_command_queue queue, int num_elements, MTdata d) { size_t sizes[] = { 1, 2, 3, 4, 0 }; unsigned int size; int retVal = 0; for( size = 0; sizes[ size ] != 0 ; size++ ) { double maxUlps = 2.5f + // error in rsqrt + error in multiply 0.5f * // effect on e of taking sqrt( x + e ) ( 0.5f * (double) sizes[size] + // cumulative error for multiplications 0.5f * (double) (sizes[size]-1)); // cumulative error for additions maxUlps *= 2.0; //our reference code is not infinitely precise and may have error of its own if( test_oneToOne_kernel_double( queue, context, "normalize", sizes[ size ], verifyNormalize_double, maxUlps, d ) != 0 ) { log_error( " normalize double vector size %d FAILED\n", (int)sizes[ size ] ); retVal = -1; } else { log_info( " normalize double vector size %d passed\n", (int)sizes[ size ] ); } } return retVal; }