OpenCL Example
Platform information
get_info.cpp
#include <CL/cl.h>
#include <cstdlib>
#include <iostream>
using namespace std;
int main(void) {
cl_int status;
size_t str_info_size;
char * str_info;
cl_uint uint_info;
cl_uint num_platforms;
cl_platform_id *platforms;
cl_uint num_devices;
cl_device_id *devices;
status = clGetPlatformIDs(0, NULL, &num_platforms);
if(status != CL_SUCCESS) {
cerr << "Failed to get the number of platforms." << endl;
return -1;
}
if(num_platforms > 0) {
cout << "* The number of platforms: " << num_platforms << endl;
} else {
cerr << "The number of platforms is zero." << endl;
return -1;
}
platforms
= (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
for(int i = 0; i < num_platforms; ++i) {
cout << " -----------------------------------------------" << endl;
cout << " Platform-" << i + 1 << endl;
clGetPlatformInfo(
platforms[i], CL_PLATFORM_NAME, 0, NULL, &str_info_size);
str_info = (char *)malloc(str_info_size);
clGetPlatformInfo(
platforms[i], CL_PLATFORM_NAME, str_info_size, str_info, NULL);
cout << " * Platform: " << str_info << endl;
free(str_info);
clGetPlatformInfo(
platforms[i], CL_PLATFORM_PROFILE, 0, NULL, &str_info_size);
str_info = (char *)malloc(str_info_size);
clGetPlatformInfo(
platforms[i], CL_PLATFORM_PROFILE, str_info_size, str_info, NULL);
cout << " * Profile: " << str_info << endl;
free(str_info);
clGetPlatformInfo(
platforms[i], CL_PLATFORM_VERSION, 0, NULL, &str_info_size);
str_info = (char *)malloc(str_info_size);
clGetPlatformInfo(
platforms[i], CL_PLATFORM_VERSION, str_info_size, str_info, NULL);
cout << " * " << str_info << endl;
free(str_info);
status = clGetDeviceIDs(
platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
if(status != CL_SUCCESS) {
cerr << "Failed to get the number of devices in the platform "
<< i + 1 << '.' << endl;
continue;
}
if(num_devices > 0) {
cout << " * The number of devices: " << num_devices << endl;
} else {
cerr << "Platform-" << i + 1 << " has no devices." << endl;
continue;
}
devices = (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
clGetDeviceIDs(
platforms[i], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
for(int j = 0; j < num_devices; ++j) {
cout << " ================================================="
<< endl;
cout << " Device-" << j + 1 << endl;
clGetDeviceInfo(
devices[j], CL_DEVICE_NAME, 0, NULL, &str_info_size);
str_info = (char *)malloc(str_info_size);
clGetDeviceInfo(
devices[j], CL_DEVICE_NAME, str_info_size, str_info, NULL);
cout << " * name: " << str_info << endl;
free(str_info);
clGetDeviceInfo(devices[j],
CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(uint_info),
&uint_info,
NULL);
cout << " * Max Compute Units: " << uint_info << endl;
clGetDeviceInfo(devices[j],
CL_DEVICE_MAX_CLOCK_FREQUENCY,
sizeof(uint_info),
&uint_info,
NULL);
cout << " * Max Clock freq: " << uint_info << endl;
cout << " ================================================="
<< endl;
}
cout << " -----------------------------------------------" << endl;
}
free(devices);
free(platforms);
return 0;
}
g++ -o get_info get_info.cpp -lOpenCL
SAXPY, CPU vs. GPU or Mali-GPU
- CPU vs. GPU
- CPU vs. Mali GPU
saxpy_cpu_vs_gpu.cpp
#include <CL/cl.h>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
using namespace std;
// Number of float elements in each vector (host arrays and device buffers).
const int LENGTH = 1000000;
// OpenCL C source of the `saxpy` kernel. Each work-item takes its global id
// in dimension 0 and updates Y[gid] = a * X[gid] + Y[gid] in place.
// The kernel has no bounds check, so the global work size used to launch it
// must not exceed the number of elements in X and Y.
const char *source
= "__kernel void saxpy(const __global float *X, \n"
" __global float *Y, \n"
" const float a) { \n"
" uint gid = get_global_id(0); \n"
" Y[gid] = a * X[gid] + Y[gid]; \n"
"} \n";
int main(void) {
cout << fixed;
cl_int status;
time_t t, t2 = 0;
cl_uint num_platforms = 1;
cl_platform_id *platforms
= (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
cl_uint num_devices = 2;
cl_device_id *devices
= (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
clGetDeviceIDs(
platforms[0], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
cl_context context
= clCreateContext(NULL, 1, &devices[0], NULL, NULL, NULL);
cl_command_queue queue = clCreateCommandQueue(context, devices[0], 0, NULL);
cl_program program = clCreateProgramWithSource(
context, 1, (const char **)&source, NULL, NULL);
status = clBuildProgram(program, 1, &devices[0], NULL, NULL, NULL);
if(status != CL_SUCCESS) {
cerr << "clBuildProgram failed: " << status << endl;
char log[0x10000];
clGetProgramBuildInfo(
program, devices[0], CL_PROGRAM_BUILD_LOG, 0x10000, log, NULL);
cerr << log << endl;
return -1;
}
cl_kernel kernel = clCreateKernel(program, "saxpy", NULL);
cl_float *p_x = (cl_float *)calloc(LENGTH, sizeof(cl_float));
cl_float *p_y = (cl_float *)calloc(LENGTH, sizeof(cl_float));
cl_float *p_ret = (cl_float *)calloc(LENGTH, sizeof(cl_float));
cl_float *p_cl_ret = (cl_float *)calloc(LENGTH, sizeof(cl_float));
cl_float a = 2.f;
srand(time(NULL));
for(int i = 0; i < LENGTH; ++i) {
p_x[i] = (cl_float)rand() / RAND_MAX;
p_y[i] = (cl_float)rand() / RAND_MAX;
}
cl_mem buf_x = clCreateBuffer(
context, CL_MEM_READ_ONLY, sizeof(cl_float) * LENGTH, NULL, NULL);
cl_mem buf_y = clCreateBuffer(
context, CL_MEM_READ_WRITE, sizeof(cl_float) * LENGTH, NULL, NULL);
t = clock();
for(int i = 0; i < LENGTH; ++i) { p_ret[i] = a * p_x[i] + p_y[i]; }
t = clock() - t;
cout << "CPU" << endl;
cout << " * Time (s): "
<< (double)t / CLOCKS_PER_SEC << endl;
cout << "---------------------------------------------------" << endl;
t = clock();
clEnqueueWriteBuffer(queue,
buf_x,
CL_TRUE,
0,
sizeof(cl_float) * LENGTH,
p_x,
0,
NULL,
NULL);
clEnqueueWriteBuffer(queue,
buf_y,
CL_TRUE,
0,
sizeof(cl_float) * LENGTH,
p_y,
0,
NULL,
NULL);
clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_x);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_y);
clSetKernelArg(kernel, 2, sizeof(cl_float), &a);
t = clock() - t;
t2 += t;
cout << "GPU" << endl;
cout << " * Writes buffers and sets arguments (s): "
<< (double)t / CLOCKS_PER_SEC << endl;
t = clock();
clEnqueueNDRangeKernel(
queue, kernel, 1, NULL, (size_t *)&LENGTH, NULL, 0, NULL, NULL);
clFinish(queue);
t = clock() - t;
t2 += t;
cout << " * Runnig a kernel (s): "
<< (double)t / CLOCKS_PER_SEC << endl;
t = clock();
clEnqueueReadBuffer(queue,
buf_y,
CL_TRUE,
0,
sizeof(cl_float) * LENGTH,
p_cl_ret,
0,
NULL,
NULL);
t = clock() - t;
t2 += t;
cout << " * Reads buffer (s): "
<< (double)t / CLOCKS_PER_SEC << endl;
cout << endl
<< " * Total time (s): "
<< (double)t2 / CLOCKS_PER_SEC << endl;
for(int i = 0; i < LENGTH; ++i) {
if(abs(p_ret[i] - p_cl_ret[i]) > 0.01) {
cout << "Failed to SAXPY" << endl;
}
}
clReleaseMemObject(buf_y);
clReleaseMemObject(buf_x);
free(p_cl_ret);
free(p_ret);
free(p_y);
free(p_x);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
free(devices);
free(platforms);
return 0;
}
g++ -o saxpy_cpu_vs_gpu saxpy_cpu_vs_gpu.cpp -lOpenCL
saxpy_cpu_vs_mali.cpp
#include <CL/cl.h>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
using namespace std;
// Number of float elements in each vector (host arrays and device buffers).
const int LENGTH = 1000000;
// OpenCL C source of the `saxpy` kernel. Each work-item takes its global id
// in dimension 0 and updates Y[gid] = a * X[gid] + Y[gid] in place.
// The kernel has no bounds check, so the global work size used to launch it
// must not exceed the number of elements in X and Y.
const char *source
= "__kernel void saxpy(const __global float *X, \n"
" __global float *Y, \n"
" const float a) { \n"
" uint gid = get_global_id(0); \n"
" Y[gid] = a * X[gid] + Y[gid]; \n"
"} \n";
int main(void) {
cout << fixed;
cl_int status;
time_t t, t2 = 0;
cl_uint num_platforms = 1;
cl_platform_id *platforms
= (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
clGetPlatformIDs(num_platforms, platforms, NULL);
cl_uint num_devices = 2;
cl_device_id *devices
= (cl_device_id *)malloc(sizeof(cl_device_id) * num_devices);
clGetDeviceIDs(
platforms[0], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
cl_context context
= clCreateContext(NULL, 1, &devices[0], NULL, NULL, NULL);
cl_command_queue queue = clCreateCommandQueueWithProperties(context, devices[0], 0, NULL);
cl_program program = clCreateProgramWithSource(
context, 1, (const char **)&source, NULL, NULL);
status = clBuildProgram(program, 1, &devices[0], NULL, NULL, NULL);
if(status != CL_SUCCESS) {
cerr << "clBuildProgram failed: " << status << endl;
char log[0x10000];
clGetProgramBuildInfo(
program, devices[0], CL_PROGRAM_BUILD_LOG, 0x10000, log, NULL);
cerr << log << endl;
return -1;
}
cl_kernel kernel = clCreateKernel(program, "saxpy", NULL);
cl_mem buf_x = clCreateBuffer(context,
CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
sizeof(cl_float) * LENGTH,
NULL,
NULL);
cl_mem buf_y = clCreateBuffer(context,
CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
sizeof(cl_float) * LENGTH,
NULL,
NULL);
cl_float *p_cl_x;
cl_float *p_cl_y;
cl_float *p_x;
cl_float *p_y;
cl_float *p_ret = (cl_float *)calloc(LENGTH, sizeof(cl_float));
cl_float a = 2.f;
cout << "CPU" << endl;
t2 = 0;
t = clock();
p_x = (cl_float *)calloc(LENGTH, sizeof(cl_float));
p_y = (cl_float *)calloc(LENGTH, sizeof(cl_float));
for(int i = 0; i < LENGTH; ++i) {
p_x[i] = (cl_float)i / LENGTH;
p_y[i] = (cl_float)i / LENGTH;
}
t = clock() - t;
t2 += t;
cout << " * Sets variables (s): "
<< (double)t / CLOCKS_PER_SEC << endl;
t = clock();
for(int i = 0; i < LENGTH; ++i) { p_ret[i] = a * p_x[i] + p_y[i]; }
t = clock() - t;
t2 += t;
cout << " * Runnig a forloop (s): "
<< (double)t / CLOCKS_PER_SEC << endl;
cout << endl
<< " * Total time (s): "
<< (double)t2 / CLOCKS_PER_SEC << endl;
cout << "---------------------------------------------------" << endl;
free(p_x);
free(p_y);
cout << "GPU" << endl;
t2 = 0;
t = clock();
p_cl_x = (cl_float *)clEnqueueMapBuffer(queue,
buf_x,
CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0,
sizeof(cl_float) * LENGTH,
0,
NULL,
NULL,
NULL);
p_cl_y = (cl_float *)clEnqueueMapBuffer(queue,
buf_y,
CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0,
sizeof(cl_float) * LENGTH,
0,
NULL,
NULL,
NULL);
for(int i = 0; i < LENGTH; ++i) {
p_cl_x[i] = (cl_float)i / LENGTH;
p_cl_y[i] = (cl_float)i / LENGTH;
}
clEnqueueUnmapMemObject(queue, buf_x, p_cl_x, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, buf_y, p_cl_y, 0, NULL, NULL);
clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_x);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_y);
clSetKernelArg(kernel, 2, sizeof(cl_float), &a);
t = clock() - t;
t2 += t;
cout << " * Mapping buffer and sets arguments (s): "
<< (double)t / CLOCKS_PER_SEC << endl;
t = clock();
clEnqueueNDRangeKernel(
queue, kernel, 1, NULL, (size_t *)&LENGTH, NULL, 0, NULL, NULL);
clFinish(queue);
t = clock() - t;
t2 += t;
cout << " * Runnig a kernel (s): "
<< (double)t / CLOCKS_PER_SEC << endl;
t = clock();
p_cl_y = (cl_float *)clEnqueueMapBuffer(queue,
buf_y,
CL_TRUE,
CL_MAP_READ | CL_MAP_WRITE,
0,
sizeof(cl_float) * LENGTH,
0,
NULL,
NULL,
NULL);
t = clock() - t;
t2 += t;
cout << " * Mapping buffer (s): "
<< (double)t / CLOCKS_PER_SEC << endl;
cout << endl
<< " * Total time (s): "
<< (double)t2 / CLOCKS_PER_SEC << endl;
for(int i = 0; i < LENGTH; ++i) {
if(abs(p_ret[i] - p_cl_y[i]) > 0.01) {
cout << "Failed to SAXPY" << endl;
break;
}
}
clEnqueueUnmapMemObject(queue, buf_y, p_cl_x, 0, NULL, NULL);
clReleaseMemObject(buf_y);
clReleaseMemObject(buf_x);
free(p_ret);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
free(devices);
free(platforms);
return 0;
}
g++ -o saxpy_cpu_vs_mali saxpy_cpu_vs_mali.cpp -lOpenCL