codepad
[
create a new paste
]
login
|
about
Language:
C
C++
D
Haskell
Lua
OCaml
PHP
Perl
Plain Text
Python
Ruby
Scheme
Tcl
#define __CL_ENABLE_EXCEPTIONS #include <cstdlib> #include <math.h> #include <iostream> #include <fstream> #include <CL/cl.h> #undef CL_VERSION_1_2 #include <CL/cl.hpp> using namespace std; #define BUILD_LOG 0 #define BENCH_SP 1 #define BENCH_DP 0 #define BENCH_INT 0 #define BENCH_BW 0 #define ITERATIONS 10 #define MAX(X, Y) \ (X > Y)? X: Y; #define MIN(X, Y) \ (X < Y)? X: Y; typedef struct { std::string deviceName; std::string driverVersion; unsigned numCUs; unsigned maxWGSize; unsigned maxAllocSize; unsigned maxGlobalSize; unsigned maxClockFreq; bool doubleSupported; cl_device_type deviceType; // Test specific options int gloalBWIters; int computeWgsPerCU; int computeIters; int transferBWIters; int kernelLatencyIters; } device_info_t; device_info_t getDeviceInfo(cl::Device &d) { device_info_t devInfo; devInfo.deviceName = d.getInfo<CL_DEVICE_NAME>(); devInfo.driverVersion = d.getInfo<CL_DRIVER_VERSION>(); devInfo.numCUs = (unsigned)d.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); vector<size_t> maxWIPerDim; maxWIPerDim = d.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>(); devInfo.maxWGSize = (unsigned)maxWIPerDim[0]; // Limiting max work-group size to 256 #define MAX_WG_SIZE 256 devInfo.maxWGSize = MIN(devInfo.maxWGSize, MAX_WG_SIZE); devInfo.maxAllocSize = (unsigned)d.getInfo<CL_DEVICE_MAX_MEM_ALLOC_SIZE>(); devInfo.maxGlobalSize = (unsigned)d.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>(); devInfo.maxClockFreq = (unsigned)d.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(); devInfo.doubleSupported = false; std::string extns = d.getInfo<CL_DEVICE_EXTENSIONS>(); if ((extns.find("cl_khr_fp64") != std::string::npos) || (extns.find("cl_amd_fp64") != std::string::npos)) { devInfo.doubleSupported = true; } devInfo.deviceType = d.getInfo<CL_DEVICE_TYPE>(); if (devInfo.deviceType & CL_DEVICE_TYPE_CPU) { devInfo.gloalBWIters = 20; devInfo.computeWgsPerCU = 512; devInfo.computeIters = 10; } else { // GPU devInfo.gloalBWIters = 50; devInfo.computeWgsPerCU = 512; devInfo.computeIters = 30; } devInfo.transferBWIters = 20; devInfo.kernelLatencyIters = 20000; return devInfo; } unsigned roundToPowOf2(unsigned number) { double logd = log(number) / log(2); logd = floor(logd); return (unsigned) pow(2, (int) logd); } unsigned roundToPowOf2(unsigned number, int maxPower) { double logd = log(number) / log(2); logd = floor(logd); if (maxPower > 0) { logd = MIN(logd, ((double )maxPower)); } return (unsigned) pow(2, (int) logd); } void populate(float *ptr, unsigned N) { srand((unsigned int) time(NULL)); for (int i = 0; i < (int) N; i++) { ptr[i] = (float) rand(); } } float timeInUS(cl::Event &timeEvent) { cl_ulong start = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_START>() / 1000; cl_ulong end = timeEvent.getProfilingInfo<CL_PROFILING_COMMAND_END>() / 1000; return (float)((int)end - (int)start); } float run_kernel(cl::CommandQueue &queue, cl::Kernel &kernel, cl::NDRange &globalSize, cl::NDRange &localSize, int iters) { float timed = 0; // Dummy calls queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize); queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize); queue.finish(); for (int i = 0; i < iters; i++) { cl::Event timeEvent; queue.enqueueNDRangeKernel(kernel, cl::NullRange, globalSize, localSize, NULL, &timeEvent); queue.finish(); timed += timeInUS(timeEvent); } return (timed / iters); } cl::Program compile(const char *filename, cl::Context context, cl::Device device) { std::ifstream source_file(filename); std::string source(std::istreambuf_iterator<char>(source_file), (std::istreambuf_iterator<char>())); source_file.close(); cl::Program program(context, source); program.build(); #if BUILD_LOG std::cout << "Build log: " << std::endl; std::cout << program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device); std::cout << std::endl; #endif return program; } void run_compute_sp(cl::Context context, cl::Device device, cl::CommandQueue queue, device_info_t devInfo, unsigned iters) { // Compile kernels cl::Program program = compile("compute_sp_kernels.cl", context, device); // Parameters unsigned workPerWI = 4096; // Flops executed per work-item unsigned globalWIs = (devInfo.numCUs) * (devInfo.computeWgsPerCU) * (devInfo.maxWGSize); unsigned maxWGSize = devInfo.maxWGSize; unsigned t = MIN((globalWIs * sizeof(cl_float)), devInfo.maxAllocSize); t = roundToPowOf2(t); globalWIs = t / sizeof(cl_float); cl_float A = 1.3f; float timed, gflops; // Kernel dimensions cl::NDRange globalSize = globalWIs; cl::NDRange localSize = maxWGSize; // Output buffer cl::Buffer outputBuf = cl::Buffer(context, CL_MEM_WRITE_ONLY, (globalWIs * sizeof(cl_float))); // Get kernels cl::Kernel kernel_v1(program, "compute_sp_v1"); kernel_v1.setArg(0, outputBuf), kernel_v1.setArg(1, A); cl::Kernel kernel_v2(program, "compute_sp_v2"); kernel_v2.setArg(0, outputBuf), kernel_v2.setArg(1, A); cl::Kernel kernel_v4(program, "compute_sp_v4"); kernel_v4.setArg(0, outputBuf), kernel_v4.setArg(1, A); cl::Kernel kernel_v8(program, "compute_sp_v8"); kernel_v8.setArg(0, outputBuf), kernel_v8.setArg(1, A); cl::Kernel kernel_v16(program, "compute_sp_v16"); kernel_v16.setArg(0, outputBuf), kernel_v16.setArg(1, A); // Run kernels // Vector width 1 std::cout << "float : "; timed = run_kernel(queue, kernel_v1, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; std::cout << gflops << std::endl; // Vector width 2 std::cout << "float2 : "; timed = run_kernel(queue, kernel_v2, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; std::cout << gflops << std::endl; // Vector width 4 std::cout << "float4 : "; timed = run_kernel(queue, kernel_v4, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; std::cout << gflops << std::endl; // Vector width 8 std::cout << "float8 : "; timed = run_kernel(queue, kernel_v8, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; std::cout << gflops << std::endl; // Vector width 16 std::cout << "float16 : "; timed = run_kernel(queue, kernel_v16, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; std::cout << gflops << std::endl; } void run_compute_dp(cl::Context context, cl::Device device, cl::CommandQueue queue, device_info_t devInfo, unsigned iters) { // Compile kernels cl::Program program = compile("compute_dp_kernels.cl", context, device); // Parameters unsigned workPerWI = 4096; // Flops executed per work-item unsigned globalWIs = (devInfo.numCUs) * (devInfo.computeWgsPerCU) * (devInfo.maxWGSize); unsigned maxWGSize = devInfo.maxWGSize; unsigned t = MIN((globalWIs * sizeof(cl_float)), devInfo.maxAllocSize); t = roundToPowOf2(t); globalWIs = t / sizeof(cl_float); cl_double A = 1.3f; float timed, gflops; // Output buffer cl::Buffer outputBuf = cl::Buffer(context, CL_MEM_WRITE_ONLY, (globalWIs * sizeof(cl_double))); // Kernel dimensions cl::NDRange globalSize = globalWIs; cl::NDRange localSize = maxWGSize; // Get kernels cl::Kernel kernel_v1(program, "compute_dp_v1"); kernel_v1.setArg(0, outputBuf), kernel_v1.setArg(1, A); cl::Kernel kernel_v2(program, "compute_dp_v2"); kernel_v2.setArg(0, outputBuf), kernel_v2.setArg(1, A); cl::Kernel kernel_v4(program, "compute_dp_v4"); kernel_v4.setArg(0, outputBuf), kernel_v4.setArg(1, A); cl::Kernel kernel_v8(program, "compute_dp_v8"); kernel_v8.setArg(0, outputBuf), kernel_v8.setArg(1, A); cl::Kernel kernel_v16(program, "compute_dp_v16"); kernel_v16.setArg(0, outputBuf), kernel_v16.setArg(1, A); // Run kernels // Vector width 1 std::cout << "double : "; timed = run_kernel(queue, kernel_v1, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; std::cout << gflops << std::endl; // Vector width 2 std::cout << "double2 : "; timed = run_kernel(queue, kernel_v2, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; std::cout << gflops << std::endl; } void run_compute_integer(cl::Context context, cl::Device device, cl::CommandQueue queue, device_info_t devInfo, unsigned iters) { // Compile kernels cl::Program program = compile("compute_integer_kernels.cl", context, device); // Parameters unsigned workPerWI = 4096; // Flops executed per work-item unsigned globalWIs = (devInfo.numCUs) * (devInfo.computeWgsPerCU) * (devInfo.maxWGSize); unsigned maxWGSize = devInfo.maxWGSize; unsigned t = MIN((globalWIs * sizeof(cl_float)), devInfo.maxAllocSize); t = roundToPowOf2(t); globalWIs = t / sizeof(cl_float); cl_int A = 1; float timed, gflops; // Output buffer cl::Buffer outputBuf = cl::Buffer(context, CL_MEM_WRITE_ONLY, (globalWIs * sizeof(cl_int))); // Kernel dimensions cl::NDRange globalSize = globalWIs; cl::NDRange localSize = maxWGSize; // Get kernels cl::Kernel kernel_v1(program, "compute_integer_v1"); kernel_v1.setArg(0, outputBuf), kernel_v1.setArg(1, A); cl::Kernel kernel_v2(program, "compute_integer_v2"); kernel_v2.setArg(0, outputBuf), kernel_v2.setArg(1, A); cl::Kernel kernel_v4(program, "compute_integer_v4"); kernel_v4.setArg(0, outputBuf), kernel_v4.setArg(1, A); cl::Kernel kernel_v8(program, "compute_integer_v8"); kernel_v8.setArg(0, outputBuf), kernel_v8.setArg(1, A); // Run kernels // Vector width 1 std::cout << "int : "; timed = run_kernel(queue, kernel_v1, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; std::cout << gflops << std::endl; // Vector width 2 std::cout << "int2 : "; timed = run_kernel(queue, kernel_v2, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; std::cout << gflops << std::endl; // Vector width 4 std::cout << "int4 : "; timed = run_kernel(queue, kernel_v4, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; std::cout << gflops << std::endl; // Vector width 8 std::cout << "int8 : "; timed = run_kernel(queue, kernel_v8, globalSize, localSize, iters); gflops = ((float)globalWIs * workPerWI) / timed / 1e3f; std::cout << gflops << std::endl; } void run_global_bandwidth(cl::Context context, cl::Device device, cl::CommandQueue queue, device_info_t devInfo, unsigned iters) { // Compile kernels cl::Program program = compile("global_bandwidth_kernels.cl", context, device); // Variables unsigned fetchPerWI = 16; cl_uint maxItems = devInfo.maxAllocSize / sizeof(float) / 2; cl_uint numItems; float timed, timed_lo, timed_go, timed_hi, gbps; // Set an upper-limit for cpu devies if(devInfo.deviceType & CL_DEVICE_TYPE_CPU) { numItems = roundToPowOf2(maxItems, 25); } else { numItems = roundToPowOf2(maxItems); } // Allocate buffers float *arr = new float[numItems]; populate(arr, numItems); cl::Buffer inputBuf = cl::Buffer(context, CL_MEM_READ_ONLY, (numItems * sizeof(float))); cl::Buffer outputBuf = cl::Buffer(context, CL_MEM_WRITE_ONLY, (numItems * sizeof(float))); queue.enqueueWriteBuffer(inputBuf, CL_TRUE, 0, (numItems * sizeof(float)), arr); // Kernel dimensions cl::NDRange globalSize; cl::NDRange localSize = devInfo.maxWGSize; // Get kernels cl::Kernel kernel_v1_lo(program, "global_bandwidth_v1_local_offset"); kernel_v1_lo.setArg(0, inputBuf), kernel_v1_lo.setArg(1, outputBuf); cl::Kernel kernel_v2_lo(program, "global_bandwidth_v2_local_offset"); kernel_v2_lo.setArg(0, inputBuf), kernel_v2_lo.setArg(1, outputBuf); cl::Kernel kernel_v4_lo(program, "global_bandwidth_v4_local_offset"); kernel_v4_lo.setArg(0, inputBuf), kernel_v4_lo.setArg(1, outputBuf); cl::Kernel kernel_v8_lo(program, "global_bandwidth_v8_local_offset"); kernel_v8_lo.setArg(0, inputBuf), kernel_v8_lo.setArg(1, outputBuf); cl::Kernel kernel_v16_lo(program, "global_bandwidth_v16_local_offset"); kernel_v16_lo.setArg(0, inputBuf), kernel_v16_lo.setArg(1, outputBuf); cl::Kernel kernel_v1_go(program, "global_bandwidth_v1_global_offset"); kernel_v1_go.setArg(0, inputBuf), kernel_v1_go.setArg(1, outputBuf); cl::Kernel kernel_v2_go(program, "global_bandwidth_v2_global_offset"); kernel_v2_go.setArg(0, inputBuf), kernel_v2_go.setArg(1, outputBuf); cl::Kernel kernel_v4_go(program, "global_bandwidth_v4_global_offset"); kernel_v4_go.setArg(0, inputBuf), kernel_v4_go.setArg(1, outputBuf); cl::Kernel kernel_v8_go(program, "global_bandwidth_v8_global_offset"); kernel_v8_go.setArg(0, inputBuf), kernel_v8_go.setArg(1, outputBuf); cl::Kernel kernel_v16_go(program, "global_bandwidth_v16_global_offset"); kernel_v16_go.setArg(0, inputBuf), kernel_v16_go.setArg(1, outputBuf); // Run 2 kind of bandwidth kernels // lo -- local_size offset - subsequent fetches at local_size offset // go -- global_size offset // Vector width 1 std::cout << "float : "; globalSize = numItems / fetchPerWI; timed_lo = run_kernel(queue, kernel_v1_lo, globalSize, localSize, iters); timed_go = run_kernel(queue, kernel_v1_go, globalSize, localSize, iters); timed = (timed_lo < timed_go)? timed_lo: timed_go; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; std::cout << gbps << std::endl; // Vector width 2 std::cout << "float2 : "; globalSize = (numItems / 2 / fetchPerWI); timed_lo = run_kernel(queue, kernel_v2_lo, globalSize, localSize, iters); timed_go = run_kernel(queue, kernel_v2_go, globalSize, localSize, iters); timed = (timed_lo < timed_go)? timed_lo: timed_go; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; std::cout << gbps << std::endl; // Vector width 4 std::cout << "float4 : "; globalSize = (numItems / 4 / fetchPerWI); timed_lo = run_kernel(queue, kernel_v4_lo, globalSize, localSize, iters); timed_go = run_kernel(queue, kernel_v4_go, globalSize, localSize, iters); timed = (timed_lo < timed_go)? timed_lo: timed_go; gbps = ((float)numItems * sizeof(float)) / timed / 1e3f; std::cout << gbps << std::endl; delete [] arr; } int main(int argc, char **argv) { // Get platforms vector<cl::Platform> platforms; cl::Platform::get(&platforms); // Iterate all platforms for (int p = 0; p < platforms.size(); p++) { cl::Platform platform = platforms[p]; std::cout << "Platform: " << platform.getInfo<CL_PLATFORM_NAME>() << std::endl; // Get context cl::Context ctx(CL_DEVICE_TYPE_ALL); // Get devices vector<cl::Device> devices = ctx.getInfo<CL_CONTEXT_DEVICES>(); // Iterate all devices for (int d = 0; d < devices.size(); d++) { cl::Device device = devices[d]; device_info_t devInfo = getDeviceInfo(device); std::cout << "Device: " << devInfo.deviceName << std::endl; std::cout << "Driver version : " << devInfo.driverVersion << std::endl; std::cout << "Compute units : " << devInfo.numCUs << std::endl; std::cout << "Clock frequency : " << devInfo.maxClockFreq << " MHz" << std::endl; std::cout << std::endl; // Get command queue cl::CommandQueue queue = cl::CommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE); // Parameters int iters = ITERATIONS; try { #if BENCH_SP std::cout << "Single-precision compute (GFLOPS)" << std::endl; run_compute_sp(ctx, device, queue, devInfo, iters); std::cout << std::endl; #endif #if BENCH_DP std::cout << "Double-precision compute (GFLOPS)" << std::endl; run_compute_dp(ctx, device, queue, devInfo, iters); std::cout << std::endl; #endif #if BENCH_INT std::cout << "Integer compute (GFLOPS)" << std::endl; run_compute_integer(ctx, device, queue, devInfo, iters); std::cout << std::endl; #endif #if BENCH_BW std::cout << "Global memory bandwidth (GBPS)" << std::endl; run_global_bandwidth(ctx, device, queue, devInfo, iters); std::cout << std::endl; #endif } catch (cl::Error error) { std::cerr << "OpenCL error: " << error.what() << "(" << error.err() << ")" << std::endl; } } } return EXIT_SUCCESS; }
Private
[
?
]
Run code
Submit